You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ra...@apache.org on 2018/06/28 14:55:15 UTC
[47/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
deleted file mode 100644
index 752bb48..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
+++ /dev/null
@@ -1,274 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import com.google.common.io.Closeables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.filecache.DistributedCache;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
-import org.apache.mahout.math.VarIntWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.atomic.AtomicInteger;
-
-/**
- * Convert the Mail archives (see {@link org.apache.mahout.text.SequenceFilesFromMailArchives}) to a preference
- * file that can be consumed by the {@link org.apache.mahout.cf.taste.hadoop.item.RecommenderJob}.
- * <p/>
- * This assumes the input is a Sequence File, that the key is: filename/message id and the value is a list
- * (separated by the user's choosing) containing the from email and any references
- * <p/>
- * The output is a matrix where either the from or to are the rows (represented as longs) and the columns are the
- * message ids that the user has interacted with (as a VectorWritable). This class currently does not account for
- * thread hijacking.
- * <p/>
- * It also outputs a side table mapping the row ids to their original and the message ids to the message thread id
- */
-public final class MailToPrefsDriver extends AbstractJob {
-
- private static final Logger log = LoggerFactory.getLogger(MailToPrefsDriver.class);
-
- private static final String OUTPUT_FILES_PATTERN = "part-*";
- private static final int DICTIONARY_BYTE_OVERHEAD = 4;
-
-  /**
-   * Command-line entry point: hands the arguments to Hadoop's {@link ToolRunner}, which invokes
-   * {@link #run(String[])} on a fresh driver instance. The exit code returned by the runner is not used.
-   */
-  public static void main(String[] args) throws Exception {
-    Configuration configuration = new Configuration();
-    ToolRunner.run(configuration, new MailToPrefsDriver(), args);
-  }
-
- /**
- * Parses the command line and then runs up to three phases:
- * <ol>
- * <li>a job that collects message ids, followed by writing the message-id dictionary chunks,</li>
- * <li>a job that collects "from" addresses, followed by writing the from-id dictionary chunks,</li>
- * <li>one job per (from chunk, message-id chunk) pair that writes the text preference records under
- * {@code <output>/recInput}.</li>
- * </ol>
- * Each phase is gated by {@code shouldRunNextPhase} (inherited helper, not visible in this file; presumably it
- * honours the start/end phase options -- confirm).
- *
- * @param args the command-line arguments
- * @return 0 when every executed job succeeded, -1 as soon as one job reports failure
- */
- @Override
- public int run(String[] args) throws Exception {
- addInputOption();
- addOutputOption();
- addOption(DefaultOptionCreator.overwriteOption().create());
- addOption("chunkSize", "cs", "The size of chunks to write. Default is 100 mb", "100");
- addOption("separator", "sep", "The separator used in the input file to separate to, from, subject. Default is \\n",
- "\n");
- addOption("from", "f", "The position in the input text (value) where the from email is located, starting from "
- + "zero (0).", "0");
- addOption("refs", "r", "The position in the input text (value) where the reference ids are located, "
- + "starting from zero (0).", "1");
- addOption(buildOption("useCounts", "u", "If set, then use the number of times the user has interacted with a "
- + "thread as an indication of their preference. Otherwise, use boolean preferences.", false, false,
- String.valueOf(true)));
- Map<String, List<String>> parsedArgs = parseArguments(args);
-
- Path input = getInputPath();
- Path output = getOutputPath();
- // chunkSize is interpreted as megabytes by createDictionaryChunks
- int chunkSize = Integer.parseInt(getOption("chunkSize"));
- String separator = getOption("separator");
- Configuration conf = getConf();
- boolean useCounts = hasOption("useCounts");
- AtomicInteger currentPhase = new AtomicInteger();
- // one-element array used as an out-parameter: createDictionaryChunks stores the next unused message index in it
- int[] msgDim = new int[1];
- //TODO: mod this to not do so many passes over the data. Dictionary creation could probably be a chain mapper
- List<Path> msgIdChunks = null;
- boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);
- // create the dictionary between message ids and longs
- if (shouldRunNextPhase(parsedArgs, currentPhase)) {
- //TODO: there seems to be a pattern emerging for dictionary creation
- // -- sparse vectors from seq files also has this.
- Path msgIdsPath = new Path(output, "msgIds");
- if (overwrite) {
- HadoopUtil.delete(conf, msgIdsPath);
- }
- log.info("Creating Msg Id Dictionary");
- Job createMsgIdDictionary = prepareJob(input,
- msgIdsPath,
- SequenceFileInputFormat.class,
- MsgIdToDictionaryMapper.class,
- Text.class,
- VarIntWritable.class,
- MailToDictionaryReducer.class,
- Text.class,
- VarIntWritable.class,
- SequenceFileOutputFormat.class);
-
- boolean succeeded = createMsgIdDictionary.waitForCompletion(true);
- if (!succeeded) {
- return -1;
- }
- //write out the dictionary at the top level
- msgIdChunks = createDictionaryChunks(msgIdsPath, output, "msgIds-dictionary-",
- createMsgIdDictionary.getConfiguration(), chunkSize, msgDim);
- }
- //create the dictionary between from email addresses and longs
- List<Path> fromChunks = null;
- if (shouldRunNextPhase(parsedArgs, currentPhase)) {
- Path fromIdsPath = new Path(output, "fromIds");
- if (overwrite) {
- HadoopUtil.delete(conf, fromIdsPath);
- }
- log.info("Creating From Id Dictionary");
- Job createFromIdDictionary = prepareJob(input,
- fromIdsPath,
- SequenceFileInputFormat.class,
- FromEmailToDictionaryMapper.class,
- Text.class,
- VarIntWritable.class,
- MailToDictionaryReducer.class,
- Text.class,
- VarIntWritable.class,
- SequenceFileOutputFormat.class);
- createFromIdDictionary.getConfiguration().set(EmailUtility.SEPARATOR, separator);
- boolean succeeded = createFromIdDictionary.waitForCompletion(true);
- if (!succeeded) {
- return -1;
- }
- //write out the dictionary at the top level
- // fromDim is filled in by createDictionaryChunks but never read afterwards (its only consumer below is
- // commented out)
- int[] fromDim = new int[1];
- fromChunks = createDictionaryChunks(fromIdsPath, output, "fromIds-dictionary-",
- createFromIdDictionary.getConfiguration(), chunkSize, fromDim);
- }
- //OK, we have our dictionaries, let's output the real thing we need: <from_id -> <msgId, msgId, msgId, ...>>
- // NOTE(review): both chunk lists are only assigned inside the two phases above, so when either of those phases
- // is skipped this phase is silently skipped as well and the method still returns 0.
- if (shouldRunNextPhase(parsedArgs, currentPhase) && fromChunks != null && msgIdChunks != null) {
- //Job map
- //may be a way to do this so that we can load the from ids in memory, if they are small enough so that
- // we don't need the double loop
- log.info("Creating recommendation matrix");
- Path vecPath = new Path(output, "recInput");
- if (overwrite) {
- HadoopUtil.delete(conf, vecPath);
- }
- //conf.set(EmailUtility.FROM_DIMENSION, String.valueOf(fromDim[0]));
- conf.set(EmailUtility.MSG_ID_DIMENSION, String.valueOf(msgDim[0]));
- conf.set(EmailUtility.FROM_PREFIX, "fromIds-dictionary-");
- conf.set(EmailUtility.MSG_IDS_PREFIX, "msgIds-dictionary-");
- conf.set(EmailUtility.FROM_INDEX, getOption("from"));
- conf.set(EmailUtility.REFS_INDEX, getOption("refs"));
- conf.set(EmailUtility.SEPARATOR, separator);
- conf.set(MailToRecReducer.USE_COUNTS_PREFERENCE, String.valueOf(useCounts));
- // i indexes the from chunk; j is NOT reset per from chunk, so it is a running count of all
- // (from chunk, msg id chunk) pairs processed so far
- int j = 0;
- int i = 0;
- for (Path fromChunk : fromChunks) {
- for (Path idChunk : msgIdChunks) {
- Path out = new Path(vecPath, "tmp-" + i + '-' + j);
- // register exactly this pair of dictionary chunks as the cache files for the job created below
- DistributedCache.setCacheFiles(new URI[]{fromChunk.toUri(), idChunk.toUri()}, conf);
- Job createRecMatrix = prepareJob(input, out, SequenceFileInputFormat.class,
- MailToRecMapper.class, Text.class, LongWritable.class, MailToRecReducer.class, Text.class,
- NullWritable.class, TextOutputFormat.class);
- createRecMatrix.getConfiguration().set("mapred.output.compress", "false");
- boolean succeeded = createRecMatrix.waitForCompletion(true);
- if (!succeeded) {
- return -1;
- }
- //copy the results up a level
- //HadoopUtil.copyMergeSeqFiles(out.getFileSystem(conf), out, vecPath.getFileSystem(conf), outPath, true,
- // conf, "");
- FileStatus[] fs = HadoopUtil.getFileStatus(new Path(out, "*"), PathType.GLOB, PathFilters.partFilter(), null,
- conf);
- // copy each selected file of this job to <recInput>/chunk-i-j-k, then drop the temporary job directory
- for (int k = 0; k < fs.length; k++) {
- FileStatus f = fs[k];
- Path outPath = new Path(vecPath, "chunk-" + i + '-' + j + '-' + k);
- FileUtil.copy(f.getPath().getFileSystem(conf), f.getPath(), outPath.getFileSystem(conf), outPath, true,
- overwrite, conf);
- }
- HadoopUtil.delete(conf, out);
- j++;
- }
- i++;
- }
- //concat the files together
- /*Path mergePath = new Path(output, "vectors.dat");
- if (overwrite) {
- HadoopUtil.delete(conf, mergePath);
- }
- log.info("Merging together output vectors to vectors.dat in {}", output);*/
- //HadoopUtil.copyMergeSeqFiles(vecPath.getFileSystem(conf), vecPath, mergePath.getFileSystem(conf), mergePath,
- // false, conf, "\n");
- }
-
- return 0;
- }
-
-  /**
-   * Reads every key found in the {@code part-*} sequence files under {@code inputPath} and writes them, each
-   * paired with a running integer id, into one or more dictionary sequence files named
-   * {@code name + chunkIndex} below {@code dictionaryPathBase}.
-   *
-   * @param inputPath            directory holding the sequence files to read
-   * @param dictionaryPathBase   directory that receives the dictionary chunk files
-   * @param name                 file name prefix of every chunk; the chunk index is appended to it
-   * @param baseConf             configuration that is copied and used for all file system access
-   * @param chunkSizeInMegabytes estimated size after which a new chunk file is started
-   * @param maxTermDimension     out-parameter; element 0 receives the next unused id (ids start at 1)
-   * @return the paths of all chunk files, in the order they were written
-   * @throws IOException if reading the input or writing a chunk fails
-   */
-  private static List<Path> createDictionaryChunks(Path inputPath,
-                                                   Path dictionaryPathBase,
-                                                   String name,
-                                                   Configuration baseConf,
-                                                   int chunkSizeInMegabytes, int[] maxTermDimension)
-    throws IOException {
-    List<Path> chunkPaths = new ArrayList<>();
-    Configuration conf = new Configuration(baseConf);
-    FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
-    long maxChunkBytes = chunkSizeInMegabytes * 1024L * 1024L;
-
-    // the first chunk is always created, even when there turn out to be no records
-    int chunkNumber = 0;
-    Path currentChunk = new Path(dictionaryPathBase, name + chunkNumber);
-    chunkPaths.add(currentChunk);
-    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, currentChunk, Text.class, IntWritable.class);
-
-    try {
-      Path partFiles = new Path(inputPath, OUTPUT_FILES_PATTERN);
-      long bytesInChunk = 0;
-      int nextId = 1; //start at 1, since a miss in the OpenObjectIntHashMap returns a 0
-      for (Pair<Writable, Writable> entry
-          : new SequenceFileDirIterable<>(partFiles, PathType.GLOB, null, null, true, conf)) {
-        if (bytesInChunk > maxChunkBytes) {
-          // the current chunk exceeded the limit: finish it and start the next one
-          Closeables.close(writer, false);
-          chunkNumber++;
-          currentChunk = new Path(dictionaryPathBase, name + chunkNumber);
-          chunkPaths.add(currentChunk);
-          writer = new SequenceFile.Writer(fs, conf, currentChunk, Text.class, IntWritable.class);
-          bytesInChunk = 0;
-        }
-
-        Writable key = entry.getFirst();
-        // size estimate per entry: fixed overhead + two bytes per key character + the four-byte id
-        bytesInChunk += DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
-        writer.append(key, new IntWritable(nextId));
-        nextId++;
-      }
-      maxTermDimension[0] = nextId;
-    } finally {
-      Closeables.close(writer, false);
-    }
-
-    return chunkPaths;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
deleted file mode 100644
index 91bbd17..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.commons.lang3.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.math.map.OpenObjectIntHashMap;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-
-/**
- * Maps one mail record to a {@code "<fromId>,<msgId>"} key with a count of 1.
- * <p/>
- * The sender is taken from field {@code fromIdx} of the value and the thread id from the first reference in
- * field {@code refsIdx}; when no reference is available the message id found after the last {@code '/'} of
- * the key is used instead. Both are translated to integers through the dictionaries loaded in
- * {@link #setup(Context)}.
- */
-public final class MailToRecMapper extends Mapper<Text, Text, Text, LongWritable> {
-
-  private static final Logger log = LoggerFactory.getLogger(MailToRecMapper.class);
-
-  private final OpenObjectIntHashMap<String> fromDictionary = new OpenObjectIntHashMap<>();
-  private final OpenObjectIntHashMap<String> msgIdDictionary = new OpenObjectIntHashMap<>();
-  /** Field separator of the value; keeps this default when the configuration does not define one. */
-  private String separator = "\n";
-  private int fromIdx;
-  private int refsIdx;
-
-  public enum Counters {
-    REFERENCE, ORIGINAL
-  }
-
-  /**
-   * Reads the field positions and the separator from the job configuration and loads both dictionaries.
-   */
-  @Override
-  protected void setup(Context context) throws IOException, InterruptedException {
-    super.setup(context);
-    Configuration conf = context.getConfiguration();
-    String fromPrefix = conf.get(EmailUtility.FROM_PREFIX);
-    String msgPrefix = conf.get(EmailUtility.MSG_IDS_PREFIX);
-    fromIdx = conf.getInt(EmailUtility.FROM_INDEX, 0);
-    refsIdx = conf.getInt(EmailUtility.REFS_INDEX, 1);
-    EmailUtility.loadDictionaries(conf, fromPrefix, fromDictionary, msgPrefix, msgIdDictionary);
-    log.info("From Dictionary size: {} Msg Id Dictionary size: {}", fromDictionary.size(), msgIdDictionary.size());
-    // fall back to the field's default instead of overwriting it with null when the property is absent
-    separator = conf.get(EmailUtility.SEPARATOR, separator);
-  }
-
-  /**
-   * Emits {@code ("<fromId>,<msgId>", 1)} when both a sender and a message/thread id could be determined.
-   *
-   * @param key   the record key; the message id is expected after its last {@code '/'}
-   * @param value the separator-delimited record holding the sender and the references
-   */
-  @Override
-  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
-    // Integer.MIN_VALUE marks "not determined"
-    int msgIdKey = Integer.MIN_VALUE;
-    int fromKey = Integer.MIN_VALUE;
-
-    String valStr = value.toString();
-    String[] splits = StringUtils.splitByWholeSeparatorPreserveAllTokens(valStr, separator);
-
-    // NOTE(review): the driver numbers dictionary entries from 1 because a dictionary miss returns 0; a miss
-    // is therefore emitted below as id 0 rather than being skipped -- confirm whether that is intended.
-    if (splits != null && splits.length > 0) {
-      // guard the sender field with its own index (this used to test refsIdx, which could index past the
-      // end of the array or skip a sender that was present)
-      if (splits.length > fromIdx) {
-        String from = EmailUtility.cleanUpEmailAddress(splits[fromIdx]);
-        fromKey = fromDictionary.get(from);
-      }
-      //get the references
-      if (splits.length > refsIdx) {
-        String[] theRefs = EmailUtility.parseReferences(splits[refsIdx]);
-        if (theRefs != null && theRefs.length > 0) {
-          //we have a reference, the first one is the original message id, so map to that one if it exists
-          msgIdKey = msgIdDictionary.get(theRefs[0]);
-          context.getCounter(Counters.REFERENCE).increment(1);
-        }
-      }
-    }
-    //we don't have any references, so use the msg id
-    if (msgIdKey == Integer.MIN_VALUE) {
-      //get the msg id and the from and output the associated ids
-      String keyStr = key.toString();
-      int idx = keyStr.lastIndexOf('/');
-      if (idx != -1) {
-        String msgId = keyStr.substring(idx + 1);
-        msgIdKey = msgIdDictionary.get(msgId);
-        context.getCounter(Counters.ORIGINAL).increment(1);
-      }
-    }
-
-    if (msgIdKey != Integer.MIN_VALUE && fromKey != Integer.MIN_VALUE) {
-      context.write(new Text(fromKey + "," + msgIdKey), new LongWritable(1));
-    }
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
deleted file mode 100644
index ee36a41..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Reducer;
-
-import java.io.IOException;
-
-/**
- * Writes one text line per {@code "<fromId>,<msgId>"} key: the key followed by {@code ",<count>"} when counts
- * are enabled, otherwise the key alone.
- */
-public class MailToRecReducer extends Reducer<Text, LongWritable, Text, NullWritable> {
-  //if true, then output weight
-  private boolean useCounts = true;
-  /**
-   * We can either ignore how many times the user interacted (boolean) or output the number of times they interacted.
-   * NOTE(review): despite the property name, the value {@code true} selects counts, not boolean preferences.
-   */
-  public static final String USE_COUNTS_PREFERENCE = "useBooleanPreferences";
-
-  /**
-   * Reads the {@link #USE_COUNTS_PREFERENCE} flag from the job configuration; counts are used when it is absent.
-   */
-  @Override
-  protected void setup(Context context) throws IOException, InterruptedException {
-    useCounts = context.getConfiguration().getBoolean(USE_COUNTS_PREFERENCE, true);
-  }
-
-  /**
-   * @param key    the {@code "<fromId>,<msgId>"} pair
-   * @param values the partial counts emitted for that pair
-   */
-  @Override
-  protected void reduce(Text key, Iterable<LongWritable> values, Context context)
-    throws IOException, InterruptedException {
-    if (useCounts) {
-      long sum = 0;
-      for (LongWritable value : values) {
-        // add the emitted count rather than counting records, so pre-aggregated values stay correct
-        sum += value.get();
-      }
-      context.write(new Text(key.toString() + ',' + sum), NullWritable.get());
-    } else {
-      context.write(new Text(key.toString()), NullWritable.get());
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
deleted file mode 100644
index f3de847..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.math.VarIntWritable;
-
-import java.io.IOException;
-
-/**
- * Assumes the input is in the format created by {@link org.apache.mahout.text.SequenceFilesFromMailArchives}
- */
-public final class MsgIdToDictionaryMapper extends Mapper<Text, Text, Text, VarIntWritable> {
-
-  /**
-   * Extracts the message id from the record key and emits it with a count of 1. Keys without an {@code '@'},
-   * or whose extracted id matches {@code EmailUtility.WHITESPACE}, only increment the
-   * {@code NO_MESSAGE_ID} counter.
-   */
-  @Override
-  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
-    //message id is in the key: /201008/AANLkTikvVnhNH+Y5AGEwqd2=u0CFv2mCm0ce6E6oBnj1@mail.gmail.com
-    String path = key.toString();
-    int atPos = path.lastIndexOf('@');
-    if (atPos == -1) {
-      // no '@' at all, so there is nothing that looks like a message id
-      context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
-      return;
-    }
-
-    // the id is everything after the last slash that precedes the last '@'
-    int slashPos = path.lastIndexOf('/', atPos);
-    String msgId = path.substring(slashPos + 1);
-    if (EmailUtility.WHITESPACE.matcher(msgId).matches()) {
-      context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
-      return;
-    }
-
-    context.write(new Text(msgId), new VarIntWritable(1));
-  }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
deleted file mode 100644
index c358021..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-
-/**
- * An {@link Iterable} over one data file; every call to {@link #iterator()} opens a new
- * {@link DataFileIterator} on that file.
- */
-public final class DataFileIterable implements Iterable<Pair<PreferenceArray,long[]>> {
-
-  private final File source;
-
-  /**
-   * @param dataFile the file to iterate over; it is only opened when an iterator is requested
-   */
-  public DataFileIterable(File dataFile) {
-    source = dataFile;
-  }
-
-  /**
-   * @return a fresh iterator over the file
-   * @throws IllegalStateException wrapping the {@link IOException} raised while opening the file
-   */
-  @Override
-  public Iterator<Pair<PreferenceArray, long[]>> iterator() {
-    Iterator<Pair<PreferenceArray, long[]>> opened;
-    try {
-      opened = new DataFileIterator(source);
-    } catch (IOException cause) {
-      throw new IllegalStateException(cause);
-    }
-    return opened;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
deleted file mode 100644
index 786e080..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
+++ /dev/null
@@ -1,158 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup;
-
-import java.io.Closeable;
-import java.io.File;
-import java.io.IOException;
-import java.util.regex.Pattern;
-
-import com.google.common.collect.AbstractIterator;
-import com.google.common.io.Closeables;
-import org.apache.mahout.cf.taste.impl.common.SkippingIterator;
-import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.iterator.FileLineIterator;
-import org.apache.mahout.common.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * <p>An {@link java.util.Iterator} which iterates over any of the KDD Cup's rating files. These include the files
- * {train,test,validation}Idx{1,2}}.txt. See http://kddcup.yahoo.com/. Each element in the iteration corresponds
- * to one user's ratings as a {@link PreferenceArray} and corresponding timestamps as a parallel {@code long}
- * array.</p>
- *
- * <p>Timestamps in the data set are relative to some unknown point in time, for anonymity. They are assumed
- * to be relative to the epoch, time 0, or January 1 1970, for purposes here.</p>
- */
-public final class DataFileIterator
-    extends AbstractIterator<Pair<PreferenceArray,long[]>>
-    implements SkippingIterator<Pair<PreferenceArray,long[]>>, Closeable {
-
-  private static final Pattern COLON_PATTERN = Pattern.compile(":");
-  private static final Pattern PIPE_PATTERN = Pattern.compile("\\|");
-  private static final Pattern TAB_PATTERN = Pattern.compile("\t");
-
-  private final FileLineIterator lineIterator;
-
-  private static final Logger log = LoggerFactory.getLogger(DataFileIterator.class);
-
-  /**
-   * Opens the given rating file for line-by-line reading.
-   *
-   * @param dataFile the file to read
-   * @throws IllegalArgumentException if the file is null, is a directory or does not exist
-   * @throws IOException if the file cannot be opened
-   */
-  public DataFileIterator(File dataFile) throws IOException {
-    if (dataFile == null || dataFile.isDirectory() || !dataFile.exists()) {
-      throw new IllegalArgumentException("Bad data file: " + dataFile);
-    }
-    lineIterator = new FileLineIterator(dataFile);
-  }
-
-  /**
-   * Reads one {@code userID|ratingsCount} header line and the {@code ratingsCount} data lines following it.
-   *
-   * @return the user's preferences together with a parallel array of timestamps (entries stay 0 for lines
-   *  that carry no date), or the end-of-data marker when no line is left
-   */
-  @Override
-  protected Pair<PreferenceArray, long[]> computeNext() {
-
-    if (!lineIterator.hasNext()) {
-      return endOfData();
-    }
-
-    String line = lineIterator.next();
-    // First a userID|ratingsCount line
-    String[] tokens = PIPE_PATTERN.split(line);
-
-    long userID = Long.parseLong(tokens[0]);
-    int ratingsLeftToRead = Integer.parseInt(tokens[1]);
-    int ratingsRead = 0;
-
-    PreferenceArray currentUserPrefs = new GenericUserPreferenceArray(ratingsLeftToRead);
-    long[] timestamps = new long[ratingsLeftToRead];
-
-    while (ratingsLeftToRead > 0) {
-
-      line = lineIterator.next();
-
-      // Then a data line. May be 1-4 tokens depending on whether preference info is included (it's not in test data)
-      // or whether date info is included (not included in track 2). Item ID is always first, and date is the last
-      // two fields if it exists.
-      tokens = TAB_PATTERN.split(line);
-      boolean hasPref = tokens.length == 2 || tokens.length == 4;
-      boolean hasDate = tokens.length > 2;
-
-      long itemID = Long.parseLong(tokens[0]);
-
-      currentUserPrefs.setUserID(0, userID);
-      currentUserPrefs.setItemID(ratingsRead, itemID);
-      if (hasPref) {
-        float preference = Float.parseFloat(tokens[1]);
-        currentUserPrefs.setValue(ratingsRead, preference);
-      }
-
-      if (hasDate) {
-        // the date and time are the last two tokens; their position depends on whether a preference precedes them
-        long timestamp;
-        if (hasPref) {
-          timestamp = parseFakeTimestamp(tokens[2], tokens[3]);
-        } else {
-          timestamp = parseFakeTimestamp(tokens[1], tokens[2]);
-        }
-        timestamps[ratingsRead] = timestamp;
-      }
-
-      ratingsRead++;
-      ratingsLeftToRead--;
-    }
-
-    return new Pair<>(currentUserPrefs, timestamps);
-  }
-
-  /**
-   * Skips up to {@code n} users: for each one the header line is read and its announced number of data lines
-   * is skipped. Stops early when the file has no more lines.
-   *
-   * @param n the number of users to skip
-   */
-  @Override
-  public void skip(int n) {
-    for (int i = 0; i < n; i++) {
-      if (!lineIterator.hasNext()) {
-        break;
-      }
-      String line = lineIterator.next();
-      // First a userID|ratingsCount line
-      String[] tokens = PIPE_PATTERN.split(line);
-      int linesToSkip = Integer.parseInt(tokens[1]);
-      lineIterator.skip(linesToSkip);
-    }
-  }
-
-  /**
-   * Marks the iteration as finished and closes the underlying line iterator; an {@link IOException} raised
-   * while closing is logged instead of being propagated.
-   */
-  @Override
-  public void close() {
-    endOfData();
-    try {
-      Closeables.close(lineIterator, true);
-    } catch (IOException e) {
-      log.error(e.getMessage(), e);
-    }
-  }
-
-  /**
-   * @param dateString "date" in days since some undisclosed date, which we will arbitrarily assume to be the
-   *  epoch, January 1 1970.
-   * @param timeString time of day in HH:mm:ss format
-   * @return the UNIX timestamp (in seconds) for this moment in time
-   */
-  private static long parseFakeTimestamp(String dateString, CharSequence timeString) {
-    int days = Integer.parseInt(dateString);
-    String[] timeTokens = COLON_PATTERN.split(timeString);
-    int hours = Integer.parseInt(timeTokens[0]);
-    int minutes = Integer.parseInt(timeTokens[1]);
-    int seconds = Integer.parseInt(timeTokens[2]);
-    // hours must be scaled by 3600 seconds; the previous code added the hour count to a constant 3600
-    return 86400L * days + 3600L * hours + 60L * minutes + seconds;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
deleted file mode 100644
index 4b62050..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
+++ /dev/null
@@ -1,231 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Iterator;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.SamplingIterator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * <p>A {@link DataModel} which reads into memory any of the KDD Cup's rating files; it is really
- * meant for use with training data in the files trainIdx{1,2}.txt.
- * See http://kddcup.yahoo.com/.</p>
- *
- * <p>Timestamps in the data set are relative to some unknown point in time, for anonymity. They are assumed
- * to be relative to the epoch, time 0, or January 1 1970, for purposes here.</p>
- *
- * <p>Once loaded, every query except the preference bounds and {@link #refresh(Collection)} is forwarded
- * to an in-memory {@link GenericDataModel}.</p>
- */
-public final class KDDCupDataModel implements DataModel {
-
-  private static final Logger log = LoggerFactory.getLogger(KDDCupDataModel.class);
-
-  private final File dataFileDirectory;
-  private final DataModel delegate;
-
-  /**
-   * Loads all users from the file, without timestamps.
-   *
-   * @param dataFile training rating file
-   */
-  public KDDCupDataModel(File dataFile) throws IOException {
-    this(dataFile, false, 1.0);
-  }
-
-  /**
-   * @param dataFile training rating file
-   * @param storeDates if true, dates are parsed and stored, otherwise not
-   * @param samplingRate fraction of users to keep, in the range (0.0, 1.0]; can be used to reduce
-   *  memory requirements
-   * @throws IllegalArgumentException if {@code samplingRate} is NaN or outside (0.0, 1.0]
-   * @throws IOException propagated from creating the iterator over {@code dataFile}
-   */
-  public KDDCupDataModel(File dataFile, boolean storeDates, double samplingRate) throws IOException {
-
-    Preconditions.checkArgument(!Double.isNaN(samplingRate) && samplingRate > 0.0 && samplingRate <= 1.0,
-        "Must be: 0.0 < samplingRate <= 1.0");
-
-    dataFileDirectory = dataFile.getParentFile();
-
-    Iterator<Pair<PreferenceArray,long[]>> dataIterator = new DataFileIterator(dataFile);
-    if (samplingRate < 1.0) {
-      dataIterator = new SamplingIterator<>(dataIterator, samplingRate);
-    }
-
-    FastByIDMap<PreferenceArray> userData = new FastByIDMap<>();
-    FastByIDMap<FastByIDMap<Long>> timestamps = new FastByIDMap<>();
-
-    while (dataIterator.hasNext()) {
-
-      Pair<PreferenceArray,long[]> pair = dataIterator.next();
-      PreferenceArray userPrefs = pair.getFirst();
-      long[] timestampsForPrefs = pair.getSecond();
-      long userID = userPrefs.getUserID(0);
-
-      userData.put(userID, userPrefs);
-      if (storeDates) {
-        FastByIDMap<Long> itemTimestamps = new FastByIDMap<>();
-        for (int i = 0; i < timestampsForPrefs.length; i++) {
-          long timestamp = timestampsForPrefs[i];
-          // Only positive timestamps are recorded
-          if (timestamp > 0L) {
-            itemTimestamps.put(userPrefs.getItemID(i), timestamp);
-          }
-        }
-        // Register this user's timestamps; otherwise the map handed to the delegate stays empty
-        timestamps.put(userID, itemTimestamps);
-      }
-
-    }
-
-    if (storeDates) {
-      delegate = new GenericDataModel(userData, timestamps);
-    } else {
-      delegate = new GenericDataModel(userData);
-    }
-
-    Runtime runtime = Runtime.getRuntime();
-    log.info("Loaded data model in about {}MB heap", (runtime.totalMemory() - runtime.freeMemory()) / 1000000);
-  }
-
-  /**
-   * @return the parent directory of the data file this model was loaded from
-   */
-  public File getDataFileDirectory() {
-    return dataFileDirectory;
-  }
-
-  /** Locates the {@code trainIdx} file in the given directory; see {@link #getFile(File, String)}. */
-  public static File getTrainingFile(File dataFileDirectory) {
-    return getFile(dataFileDirectory, "trainIdx");
-  }
-
-  /** Locates the {@code validationIdx} file in the given directory; see {@link #getFile(File, String)}. */
-  public static File getValidationFile(File dataFileDirectory) {
-    return getFile(dataFileDirectory, "validationIdx");
-  }
-
-  /** Locates the {@code testIdx} file in the given directory; see {@link #getFile(File, String)}. */
-  public static File getTestFile(File dataFileDirectory) {
-    return getFile(dataFileDirectory, "testIdx");
-  }
-
-  /** Locates the {@code trackData} file in the given directory; see {@link #getFile(File, String)}. */
-  public static File getTrackFile(File dataFileDirectory) {
-    return getFile(dataFileDirectory, "trackData");
-  }
-
-  /**
-   * Returns the first existing file named {@code prefix + set + [.firstLines] + ".txt" + [.gz]}, trying
-   * set 1 before set 2, the full file before the {@code .firstLines} sample, and gzipped before plain.
-   *
-   * @param dataFileDirectory directory to search
-   * @param prefix file name prefix, such as {@code trainIdx}
-   * @return the first candidate file that exists
-   * @throws IllegalArgumentException if none of the candidate files exists
-   */
-  private static File getFile(File dataFileDirectory, String prefix) {
-    // Works on set 1 or 2
-    for (int set : new int[] {1,2}) {
-      // Works on sample data from before contest or real data
-      for (String firstLinesOrNot : new String[] {"", ".firstLines"}) {
-        for (String gzippedOrNot : new String[] {".gz", ""}) {
-          File dataFile = new File(dataFileDirectory, prefix + set + firstLinesOrNot + ".txt" + gzippedOrNot);
-          if (dataFile.exists()) {
-            return dataFile;
-          }
-        }
-      }
-    }
-    throw new IllegalArgumentException("Can't find " + prefix + " file in " + dataFileDirectory);
-  }
-
-  @Override
-  public LongPrimitiveIterator getUserIDs() throws TasteException {
-    return delegate.getUserIDs();
-  }
-
-  @Override
-  public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
-    return delegate.getPreferencesFromUser(userID);
-  }
-
-  @Override
-  public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
-    return delegate.getItemIDsFromUser(userID);
-  }
-
-  @Override
-  public LongPrimitiveIterator getItemIDs() throws TasteException {
-    return delegate.getItemIDs();
-  }
-
-  @Override
-  public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
-    return delegate.getPreferencesForItem(itemID);
-  }
-
-  @Override
-  public Float getPreferenceValue(long userID, long itemID) throws TasteException {
-    return delegate.getPreferenceValue(userID, itemID);
-  }
-
-  @Override
-  public Long getPreferenceTime(long userID, long itemID) throws TasteException {
-    return delegate.getPreferenceTime(userID, itemID);
-  }
-
-  @Override
-  public int getNumItems() throws TasteException {
-    return delegate.getNumItems();
-  }
-
-  @Override
-  public int getNumUsers() throws TasteException {
-    return delegate.getNumUsers();
-  }
-
-  @Override
-  public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
-    return delegate.getNumUsersWithPreferenceFor(itemID);
-  }
-
-  @Override
-  public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
-    return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2);
-  }
-
-  @Override
-  public void setPreference(long userID, long itemID, float value) throws TasteException {
-    delegate.setPreference(userID, itemID, value);
-  }
-
-  @Override
-  public void removePreference(long userID, long itemID) throws TasteException {
-    delegate.removePreference(userID, itemID);
-  }
-
-  @Override
-  public boolean hasPreferenceValues() {
-    return delegate.hasPreferenceValues();
-  }
-
-  /** @return the fixed upper bound of 100 used for this data set */
-  @Override
-  public float getMaxPreference() {
-    return 100.0f;
-  }
-
-  /** @return the fixed lower bound of 0 used for this data set */
-  @Override
-  public float getMinPreference() {
-    return 0.0f;
-  }
-
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    // do nothing
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
deleted file mode 100644
index 3f4a732..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup;
-
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.zip.GZIPOutputStream;
-
-/**
- * <p>This class converts a KDD Cup input file into a compressed CSV format. The output format is
- * {@code userID,itemID,score,timestamp}. It can optionally restrict its output to exclude
- * score and/or timestamp.</p>
- *
- * <p>Run as: {@code ToCSV (input file) (output file) [num columns to output]}</p>
- */
-public final class ToCSV {
-
- private ToCSV() {
- }
-
- public static void main(String[] args) throws Exception {
-
- File inputFile = new File(args[0]);
- File outputFile = new File(args[1]);
- int columnsToOutput = 4;
- if (args.length >= 3) {
- columnsToOutput = Integer.parseInt(args[2]);
- }
-
- OutputStream outStream = new GZIPOutputStream(new FileOutputStream(outputFile));
-
- try (Writer outWriter = new BufferedWriter(new OutputStreamWriter(outStream, Charsets.UTF_8))){
- for (Pair<PreferenceArray,long[]> user : new DataFileIterable(inputFile)) {
- PreferenceArray prefs = user.getFirst();
- long[] timestamps = user.getSecond();
- for (int i = 0; i < prefs.length(); i++) {
- outWriter.write(String.valueOf(prefs.getUserID(i)));
- outWriter.write(',');
- outWriter.write(String.valueOf(prefs.getItemID(i)));
- if (columnsToOutput > 2) {
- outWriter.write(',');
- outWriter.write(String.valueOf(prefs.getValue(i)));
- }
- if (columnsToOutput > 3) {
- outWriter.write(',');
- outWriter.write(String.valueOf(timestamps[i]));
- }
- outWriter.write('\n');
- }
- }
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
deleted file mode 100644
index 0112ab9..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Turns a floating-point preference estimate into a single byte.
- */
-public final class EstimateConverter {
-
-  private static final Logger log = LoggerFactory.getLogger(EstimateConverter.class);
-
-  private EstimateConverter() {}
-
-  /**
-   * Scales {@code estimate} by 2.55, truncates it to an int, clamps it to [0, 255] and narrows it to a byte
-   * (so clamped values above 127 read as negative when the byte is treated as signed).
-   * A NaN estimate is logged as a warning and reported as {@code 0x7F}.
-   *
-   * @param estimate estimated preference value
-   * @param userID user the estimate is for; only used in the warning message
-   * @param itemID item the estimate is for; only used in the warning message
-   * @return the scaled, clamped estimate as a byte, or {@code 0x7F} for NaN
-   */
-  public static byte convert(double estimate, long userID, long itemID) {
-    if (Double.isNaN(estimate)) {
-      log.warn("Unable to compute estimate for user {}, item {}", userID, itemID);
-      return 0x7F;
-    }
-    int scaled = (int) (estimate * 2.55);
-    int clamped = Math.max(0, Math.min(255, scaled));
-    return (byte) clamped;
-  }
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
deleted file mode 100644
index 72056da..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import java.util.concurrent.Callable;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.mahout.cf.taste.common.NoSuchItemException;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Estimates, for one user, the preference for every item in that user's test preferences and returns
- * the estimates as bytes produced by {@link EstimateConverter}.
- */
-final class Track1Callable implements Callable<byte[]> {
-
-  private static final Logger log = LoggerFactory.getLogger(Track1Callable.class);
-  /** Number of users completed across all instances; used only for progress logging. */
-  private static final AtomicInteger COUNT = new AtomicInteger();
-
-  private final Recommender recommender;
-  private final PreferenceArray userTest;
-
-  /**
-   * @param recommender recommender used to estimate preferences
-   * @param userTest test preferences of a single user; its item IDs are the ones estimated
-   */
-  Track1Callable(Recommender recommender, PreferenceArray userTest) {
-    this.recommender = recommender;
-    this.userTest = userTest;
-  }
-
-  /**
-   * @return one byte per test preference, in the same order; an entry stays 0 when the recommender
-   *  reports the item as unknown
-   * @throws TasteException propagated from the recommender, other than {@link NoSuchItemException}
-   */
-  @Override
-  public byte[] call() throws TasteException {
-    long userID = userTest.get(0).getUserID();
-    byte[] result = new byte[userTest.length()];
-    for (int i = 0; i < userTest.length(); i++) {
-      long itemID = userTest.getItemID(i);
-      double estimate;
-      try {
-        estimate = recommender.estimatePreference(userID, itemID);
-      } catch (NoSuchItemException nsie) {
-        // OK in the sample data provided before the contest, should never happen otherwise
-        log.warn("Unknown item {}; OK unless this is the real contest data", itemID);
-        continue;
-      }
-      result[i] = EstimateConverter.convert(estimate, userID, itemID);
-    }
-
-    // Log the value this call produced; a separate COUNT.get() could observe another thread's increment
-    int completed = COUNT.incrementAndGet();
-    if (completed % 10000 == 0) {
-      log.info("Completed {} users", completed);
-    }
-
-    return result;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
deleted file mode 100644
index 067daf5..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
-import org.apache.mahout.cf.taste.impl.similarity.UncenteredCosineSimilarity;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.IDRescorer;
-import org.apache.mahout.cf.taste.recommender.RecommendedItem;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-
-/**
- * {@link Recommender} used for track 1. It wraps a {@link GenericItemBasedRecommender} configured with
- * {@link UncenteredCosineSimilarity} and forwards every call to it.
- */
-public final class Track1Recommender implements Recommender {
-
-  private final Recommender recommender;
-
-  /**
-   * @param dataModel model the item similarity and the wrapped recommender are built on
-   */
-  public Track1Recommender(DataModel dataModel) throws TasteException {
-    // Change this to whatever you like!
-    ItemSimilarity itemSimilarity = new UncenteredCosineSimilarity(dataModel);
-    this.recommender = new GenericItemBasedRecommender(dataModel, itemSimilarity);
-  }
-
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
-    return recommender.recommend(userID, howMany);
-  }
-
-  /** Same as the four-argument form with no rescorer. */
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
-    return recommender.recommend(userID, howMany, null, includeKnownItems);
-  }
-
-  /** Same as the four-argument form with {@code includeKnownItems} set to false. */
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
-    return recommender.recommend(userID, howMany, rescorer, false);
-  }
-
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
-    throws TasteException {
-    List<RecommendedItem> recommendations = recommender.recommend(userID, howMany, rescorer, includeKnownItems);
-    return recommendations;
-  }
-
-  @Override
-  public float estimatePreference(long userID, long itemID) throws TasteException {
-    return recommender.estimatePreference(userID, itemID);
-  }
-
-  @Override
-  public void setPreference(long userID, long itemID, float value) throws TasteException {
-    recommender.setPreference(userID, itemID, value);
-  }
-
-  @Override
-  public void removePreference(long userID, long itemID) throws TasteException {
-    recommender.removePreference(userID, itemID);
-  }
-
-  @Override
-  public DataModel getDataModel() {
-    return recommender.getDataModel();
-  }
-
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    recommender.refresh(alreadyRefreshed);
-  }
-
-  @Override
-  public String toString() {
-    StringBuilder description = new StringBuilder("Track1Recommender[recommender:");
-    description.append(recommender).append(']');
-    return description.toString();
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
deleted file mode 100644
index 6b9fe1b..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-
-final class Track1RecommenderBuilder implements RecommenderBuilder {
-
- @Override
- public Recommender buildRecommender(DataModel dataModel) throws TasteException {
- return new Track1Recommender(dataModel);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
deleted file mode 100644
index bcd0a3d..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
+++ /dev/null
@@ -1,108 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import java.io.File;
-import java.util.Collection;
-import java.util.concurrent.Callable;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import com.google.common.collect.Lists;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.DataModelBuilder;
-import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
-import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
-import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
-import org.apache.mahout.cf.taste.impl.common.RunningAverage;
-import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
-import org.apache.mahout.cf.taste.impl.eval.AbstractDifferenceRecommenderEvaluator;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.apache.mahout.common.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Attempts to run an evaluation just like that dictated for Yahoo's KDD Cup, Track 1.
- * It will compute the RMSE of a validation data set against the predicted ratings from
- * the training data set.
- */
-public final class Track1RecommenderEvaluator extends AbstractDifferenceRecommenderEvaluator {
-
-  private static final Logger log = LoggerFactory.getLogger(Track1RecommenderEvaluator.class);
-
-  /** Running mean of squared differences between real and estimated preferences. */
-  private RunningAverage average;
-  private final File dataFileDirectory;
-
-  /**
-   * @param dataFileDirectory directory in which the validation file is looked up
-   */
-  public Track1RecommenderEvaluator(File dataFileDirectory) {
-    setMaxPreference(100.0f);
-    setMinPreference(0.0f);
-    average = new FullRunningAverage();
-    this.dataFileDirectory = dataFileDirectory;
-  }
-
-  /**
-   * Builds a recommender over {@code dataModel}, queues one estimate task per user record found in the
-   * validation file, runs them and returns the resulting RMSE.
-   *
-   * @param recommenderBuilder builds the recommender under test
-   * @param dataModelBuilder not used by this implementation
-   * @param dataModel data the recommender is built from
-   * @param trainingPercentage not used by this implementation
-   * @param evaluationPercentage not used by this implementation
-   * @return square root of the mean squared difference accumulated during this call
-   */
-  @Override
-  public double evaluate(RecommenderBuilder recommenderBuilder,
-                         DataModelBuilder dataModelBuilder,
-                         DataModel dataModel,
-                         double trainingPercentage,
-                         double evaluationPercentage) throws TasteException {
-
-    // Start from a clean average so a reused evaluator does not mix in errors from an earlier run
-    reset();
-
-    Recommender recommender = recommenderBuilder.buildRecommender(dataModel);
-
-    Collection<Callable<Void>> estimateCallables = Lists.newArrayList();
-    AtomicInteger noEstimateCounter = new AtomicInteger();
-    for (Pair<PreferenceArray,long[]> userData
-        : new DataFileIterable(KDDCupDataModel.getValidationFile(dataFileDirectory))) {
-      PreferenceArray validationPrefs = userData.getFirst();
-      long userID = validationPrefs.get(0).getUserID();
-      estimateCallables.add(
-          new PreferenceEstimateCallable(recommender, userID, validationPrefs, noEstimateCounter));
-    }
-
-    RunningAverageAndStdDev timing = new FullRunningAverageAndStdDev();
-    execute(estimateCallables, noEstimateCounter, timing);
-
-    double result = computeFinalEvaluation();
-    log.info("Evaluation result: {}", result);
-    return result;
-  }
-
-  // Use RMSE scoring:
-
-  /** Discards everything accumulated so far. */
-  @Override
-  protected void reset() {
-    average = new FullRunningAverage();
-  }
-
-  /** Adds the squared difference between the real and the estimated preference to the average. */
-  @Override
-  protected void processOneEstimate(float estimatedPreference, Preference realPref) {
-    double diff = realPref.getValue() - estimatedPreference;
-    average.addDatum(diff * diff);
-  }
-
-  /** @return the square root of the current average of squared differences */
-  @Override
-  protected double computeFinalEvaluation() {
-    return Math.sqrt(average.getAverage());
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
deleted file mode 100644
index deadc00..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.commons.cli2.OptionException;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.example.TasteOptionParser;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Command-line entry point that evaluates {@link Track1Recommender} with
- * {@link Track1RecommenderEvaluator} and logs the resulting score.
- */
-public final class Track1RecommenderEvaluatorRunner {
-
-  private static final Logger log = LoggerFactory.getLogger(Track1RecommenderEvaluatorRunner.class);
-
-  private Track1RecommenderEvaluatorRunner() {
-  }
-
-  /**
-   * @param args command-line arguments, handed to {@link TasteOptionParser#getRatings(String[])} to obtain
-   *  the data directory
-   * @throws IllegalArgumentException if no data directory is given or it is not an existing directory
-   */
-  public static void main(String... args) throws IOException, TasteException, OptionException {
-    File ratingsDirectory = TasteOptionParser.getRatings(args);
-    if (ratingsDirectory == null) {
-      throw new IllegalArgumentException("No data directory");
-    }
-    boolean usableDirectory = ratingsDirectory.exists() && ratingsDirectory.isDirectory();
-    if (!usableDirectory) {
-      throw new IllegalArgumentException("Bad data file directory: " + ratingsDirectory);
-    }
-
-    Track1RecommenderEvaluator evaluator = new Track1RecommenderEvaluator(ratingsDirectory);
-    File trainingFile = KDDCupDataModel.getTrainingFile(ratingsDirectory);
-    DataModel trainingModel = new KDDCupDataModel(trainingFile);
-
-    // No data model builder is passed, and both percentages are NaN
-    double score = evaluator.evaluate(new Track1RecommenderBuilder(), null, trainingModel, Float.NaN, Float.NaN);
-    log.info(String.valueOf(score));
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
deleted file mode 100644
index a0ff126..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-
-/**
- * <p>Runs "track 1" of the KDD Cup competition using whatever recommender is inside {@link Track1Recommender}
- * and attempts to output the result in the correct contest format.</p>
- *
- * <p>Run as: {@code Track1Runner [track 1 data file directory] [output file]}</p>
- */
-public final class Track1Runner {
-
- private static final Logger log = LoggerFactory.getLogger(Track1Runner.class);
-
- private Track1Runner() {
- }
-
- public static void main(String[] args) throws Exception {
-
- File dataFileDirectory = new File(args[0]);
- if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
- throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
- }
-
- long start = System.currentTimeMillis();
-
- KDDCupDataModel model = new KDDCupDataModel(KDDCupDataModel.getTrainingFile(dataFileDirectory));
- Track1Recommender recommender = new Track1Recommender(model);
-
- long end = System.currentTimeMillis();
- log.info("Loaded model in {}s", (end - start) / 1000);
- start = end;
-
- Collection<Track1Callable> callables = new ArrayList<>();
- for (Pair<PreferenceArray,long[]> tests : new DataFileIterable(KDDCupDataModel.getTestFile(dataFileDirectory))) {
- PreferenceArray userTest = tests.getFirst();
- callables.add(new Track1Callable(recommender, userTest));
- }
-
- int cores = Runtime.getRuntime().availableProcessors();
- log.info("Running on {} cores", cores);
- ExecutorService executor = Executors.newFixedThreadPool(cores);
- List<Future<byte[]>> results = executor.invokeAll(callables);
- executor.shutdown();
-
- end = System.currentTimeMillis();
- log.info("Ran recommendations in {}s", (end - start) / 1000);
- start = end;
-
- try (OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(args[1])))){
- for (Future<byte[]> result : results) {
- for (byte estimate : result.get()) {
- out.write(estimate);
- }
- }
- }
-
- end = System.currentTimeMillis();
- log.info("Wrote output in {}s", (end - start) / 1000);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
deleted file mode 100644
index 022d78c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.model.GenericPreference;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.Preference;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * can be used to drop {@link DataModel}s into {@link ParallelArraysSGDFactorizer}
- */
-public class DataModelFactorizablePreferences implements FactorizablePreferences {
-
- private final FastIDSet userIDs;
- private final FastIDSet itemIDs;
-
- private final List<Preference> preferences;
-
- private final float minPreference;
- private final float maxPreference;
-
- public DataModelFactorizablePreferences(DataModel dataModel) {
-
- minPreference = dataModel.getMinPreference();
- maxPreference = dataModel.getMaxPreference();
-
- try {
- userIDs = new FastIDSet(dataModel.getNumUsers());
- itemIDs = new FastIDSet(dataModel.getNumItems());
- preferences = new ArrayList<>();
-
- LongPrimitiveIterator userIDsIterator = dataModel.getUserIDs();
- while (userIDsIterator.hasNext()) {
- long userID = userIDsIterator.nextLong();
- userIDs.add(userID);
- for (Preference preference : dataModel.getPreferencesFromUser(userID)) {
- itemIDs.add(preference.getItemID());
- preferences.add(new GenericPreference(userID, preference.getItemID(), preference.getValue()));
- }
- }
- } catch (TasteException te) {
- throw new IllegalStateException("Unable to create factorizable preferences!", te);
- }
- }
-
- @Override
- public LongPrimitiveIterator getUserIDs() {
- return userIDs.iterator();
- }
-
- @Override
- public LongPrimitiveIterator getItemIDs() {
- return itemIDs.iterator();
- }
-
- @Override
- public Iterable<Preference> getPreferences() {
- return preferences;
- }
-
- @Override
- public float getMinPreference() {
- return minPreference;
- }
-
- @Override
- public float getMaxPreference() {
- return maxPreference;
- }
-
- @Override
- public int numUsers() {
- return userIDs.size();
- }
-
- @Override
- public int numItems() {
- return itemIDs.size();
- }
-
- @Override
- public int numPreferences() {
- return preferences.size();
- }
-}
-
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
deleted file mode 100644
index a126dec..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
-
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.model.Preference;
-
-/**
- * models the necessary input for {@link ParallelArraysSGDFactorizer}
- */
-public interface FactorizablePreferences {
-
- LongPrimitiveIterator getUserIDs();
-
- LongPrimitiveIterator getItemIDs();
-
- Iterable<Preference> getPreferences();
-
- float getMinPreference();
-
- float getMaxPreference();
-
- int numUsers();
-
- int numItems();
-
- int numPreferences();
-
-}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
deleted file mode 100644
index 6dcef6b..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
+++ /dev/null
@@ -1,123 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Iterables;
-import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
-import org.apache.mahout.cf.taste.impl.common.AbstractLongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-
-import java.io.File;
-
-public class KDDCupFactorizablePreferences implements FactorizablePreferences {
-
- private final File dataFile;
-
- public KDDCupFactorizablePreferences(File dataFile) {
- this.dataFile = dataFile;
- }
-
- @Override
- public LongPrimitiveIterator getUserIDs() {
- return new FixedSizeLongIterator(numUsers());
- }
-
- @Override
- public LongPrimitiveIterator getItemIDs() {
- return new FixedSizeLongIterator(numItems());
- }
-
- @Override
- public Iterable<Preference> getPreferences() {
- Iterable<Iterable<Preference>> prefIterators =
- Iterables.transform(new DataFileIterable(dataFile),
- new Function<Pair<PreferenceArray,long[]>,Iterable<Preference>>() {
- @Override
- public Iterable<Preference> apply(Pair<PreferenceArray,long[]> from) {
- return from.getFirst();
- }
- });
- return Iterables.concat(prefIterators);
- }
-
- @Override
- public float getMinPreference() {
- return 0;
- }
-
- @Override
- public float getMaxPreference() {
- return 100;
- }
-
- @Override
- public int numUsers() {
- return 1000990;
- }
-
- @Override
- public int numItems() {
- return 624961;
- }
-
- @Override
- public int numPreferences() {
- return 252800275;
- }
-
- static class FixedSizeLongIterator extends AbstractLongPrimitiveIterator {
-
- private long currentValue;
- private final long maximum;
-
- FixedSizeLongIterator(long maximum) {
- this.maximum = maximum;
- currentValue = 0;
- }
-
- @Override
- public long nextLong() {
- return currentValue++;
- }
-
- @Override
- public long peek() {
- return currentValue;
- }
-
- @Override
- public void skip(int n) {
- currentValue += n;
- }
-
- @Override
- public boolean hasNext() {
- return currentValue < maximum;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-
-}