You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@flink.apache.org by rm...@apache.org on 2014/06/21 11:45:26 UTC
[2/4] The Hadoop Compatibility has been refactored and extended to
support the new Java API.
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/HadoopInputFormat.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/HadoopInputFormat.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/HadoopInputFormat.java
new file mode 100644
index 0000000..882f4a3
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/HadoopInputFormat.java
@@ -0,0 +1,287 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred;
+
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.util.ArrayList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import eu.stratosphere.api.common.io.FileInputFormat.FileBaseStatistics;
+import eu.stratosphere.api.common.io.InputFormat;
+import eu.stratosphere.api.common.io.statistics.BaseStatistics;
+import eu.stratosphere.api.java.tuple.Tuple2;
+import eu.stratosphere.api.java.typeutils.ResultTypeQueryable;
+import eu.stratosphere.api.java.typeutils.TupleTypeInfo;
+import eu.stratosphere.api.java.typeutils.WritableTypeInfo;
+import eu.stratosphere.configuration.Configuration;
+import eu.stratosphere.core.fs.FileStatus;
+import eu.stratosphere.core.fs.FileSystem;
+import eu.stratosphere.core.fs.Path;
+import eu.stratosphere.hadoopcompatibility.mapred.utils.HadoopUtils;
+import eu.stratosphere.hadoopcompatibility.mapred.wrapper.HadoopDummyReporter;
+import eu.stratosphere.hadoopcompatibility.mapred.wrapper.HadoopInputSplit;
+import eu.stratosphere.types.TypeInformation;
+
+public class HadoopInputFormat<K extends Writable, V extends Writable> implements InputFormat<Tuple2<K,V>, HadoopInputSplit>, ResultTypeQueryable<Tuple2<K,V>> {
+
+ private static final long serialVersionUID = 1L;
+
+ private static final Log LOG = LogFactory.getLog(HadoopInputFormat.class);
+
+ private org.apache.hadoop.mapred.InputFormat<K, V> mapredInputFormat;
+ private Class<K> keyClass;
+ private Class<V> valueClass;
+ private JobConf jobConf;
+
+ public transient K key;
+ public transient V value;
+
+ public RecordReader<K, V> recordReader;
+ private transient boolean fetched = false;
+ private transient boolean hasNext;
+
+ /**
+  * Creates an instance without assigning the wrapped Hadoop input format, the key/value
+  * classes or the job configuration.
+  */
+ public HadoopInputFormat() {
+     // nothing to initialise; the implicit superclass constructor call is sufficient
+ }
+
+ /**
+  * Creates a format that delegates to the given Hadoop (mapred API) input format.
+  *
+  * @param mapredInputFormat the Hadoop input format to wrap
+  * @param key the class of the keys
+  * @param value the class of the values
+  * @param job the job configuration; it is passed to {@code HadoopUtils.mergeHadoopConf} and then stored
+  */
+ public HadoopInputFormat(org.apache.hadoop.mapred.InputFormat<K,V> mapredInputFormat, Class<K> key, Class<V> value, JobConf job) {
+     HadoopUtils.mergeHadoopConf(job);
+     this.jobConf = job;
+     this.keyClass = key;
+     this.valueClass = value;
+     this.mapredInputFormat = mapredInputFormat;
+ }
+
+ /**
+  * Replaces the job configuration held by this format. The configuration is stored as given.
+  *
+  * @param job the new job configuration
+  */
+ public void setJobConf(JobConf job) {
+     jobConf = job;
+ }
+
+ /**
+  * @return the wrapped Hadoop input format
+  */
+ public org.apache.hadoop.mapred.InputFormat<K,V> getHadoopInputFormat() {
+     return this.mapredInputFormat;
+ }
+
+ /**
+  * Replaces the wrapped Hadoop input format.
+  *
+  * @param mapredInputFormat the Hadoop input format to delegate to from now on
+  */
+ public void setHadoopInputFormat(org.apache.hadoop.mapred.InputFormat<K,V> mapredInputFormat) {
+     final org.apache.hadoop.mapred.InputFormat<K,V> replacement = mapredInputFormat;
+     this.mapredInputFormat = replacement;
+ }
+
+ /**
+  * @return the job configuration held by this format (the stored reference, not a copy)
+  */
+ public JobConf getJobConf() {
+     return this.jobConf;
+ }
+
+ // --------------------------------------------------------------------------------------------
+ // InputFormat
+ // --------------------------------------------------------------------------------------------
+
+ /**
+  * Intentionally empty; the given parameters are not read.
+  *
+  * @param parameters ignored
+  */
+ @Override
+ public void configure(Configuration parameters) {
+     // no configuration step is required for this format
+ }
+
+ /**
+  * Gathers basic file statistics for the configured input paths.
+  *
+  * Statistics are only computed when the wrapped format is a Hadoop {@code FileInputFormat}.
+  * This is a best-effort operation: any failure is logged and reported as "no statistics".
+  *
+  * @param cachedStats previously computed statistics, or {@code null}; only used when it is a
+  *        {@code FileBaseStatistics}
+  * @return the statistics, or {@code null} if the wrapped format is not file based or the
+  *         statistics could not be determined
+  * @throws IOException declared by the interface; failures inside this method are caught and logged
+  */
+ @Override
+ public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
+     // only gather base statistics for FileInputFormats
+     if (!(mapredInputFormat instanceof FileInputFormat)) {
+         return null;
+     }
+
+     // instanceof is already false for null, so no separate null check is needed
+     final FileBaseStatistics cachedFileStats = (cachedStats instanceof FileBaseStatistics) ?
+         (FileBaseStatistics) cachedStats : null;
+
+     try {
+         final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(this.jobConf);
+
+         return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
+     } catch (IOException ioex) {
+         if (LOG.isWarnEnabled()) {
+             // pass the exception along so the stack trace is not lost
+             LOG.warn("Could not determine statistics due to an io error: "
+                 + ioex.getMessage(), ioex);
+         }
+     } catch (Throwable t) {
+         if (LOG.isErrorEnabled()) {
+             LOG.error("Unexpected problem while getting the file statistics: "
+                 + t.getMessage(), t);
+         }
+     }
+
+     // no statistics available
+     return null;
+ }
+
+ /**
+  * Asks the wrapped Hadoop input format for its splits and wraps each one in a
+  * {@link HadoopInputSplit} together with the job configuration.
+  *
+  * @param minNumSplits the split count passed on to the Hadoop format's {@code getSplits}
+  * @return one wrapper per Hadoop split, in the order returned by the Hadoop format
+  * @throws IOException if the Hadoop format fails to compute its splits
+  */
+ @Override
+ public HadoopInputSplit[] createInputSplits(int minNumSplits) throws IOException {
+     final org.apache.hadoop.mapred.InputSplit[] hadoopSplits = mapredInputFormat.getSplits(jobConf, minNumSplits);
+     final HadoopInputSplit[] wrapped = new HadoopInputSplit[hadoopSplits.length];
+     int idx = 0;
+     for (org.apache.hadoop.mapred.InputSplit hadoopSplit : hadoopSplits) {
+         wrapped[idx] = new HadoopInputSplit(hadoopSplit, jobConf);
+         idx++;
+     }
+     return wrapped;
+ }
+
+ /**
+  * @return the split class used by this format, {@code HadoopInputSplit.class}
+  */
+ @Override
+ public Class<? extends HadoopInputSplit> getInputSplitType() {
+     final Class<HadoopInputSplit> splitType = HadoopInputSplit.class;
+     return splitType;
+ }
+
+ /**
+  * Opens a record reader for the given split and creates the reusable key and value objects.
+  *
+  * @param split the split to read; its wrapped Hadoop split is handed to the Hadoop format
+  * @throws IOException if the Hadoop format cannot create the record reader
+  */
+ @Override
+ public void open(HadoopInputSplit split) throws IOException {
+     final org.apache.hadoop.mapred.InputSplit hadoopSplit = split.getHadoopInputSplit();
+     this.recordReader = this.mapredInputFormat.getRecordReader(hadoopSplit, jobConf, new HadoopDummyReporter());
+     this.key = this.recordReader.createKey();
+     this.value = this.recordReader.createValue();
+     // nothing has been read ahead yet for this split
+     this.fetched = false;
+ }
+
+ /**
+  * Reports whether the input is exhausted, reading one record ahead if none is buffered.
+  *
+  * @return {@code true} if the last read attempt yielded no record
+  * @throws IOException if reading ahead fails
+  */
+ @Override
+ public boolean reachedEnd() throws IOException {
+     if (fetched) {
+         return !hasNext;
+     }
+     fetchNext();
+     return !hasNext;
+ }
+
+ /**
+  * Reads the next record into the reusable key and value objects, remembers whether one
+  * was available, and marks the read-ahead as done.
+  *
+  * @throws IOException if the record reader fails
+  */
+ private void fetchNext() throws IOException {
+     final boolean gotRecord = this.recordReader.next(key, value);
+     this.hasNext = gotRecord;
+     this.fetched = true;
+ }
+
+ /**
+  * Returns the next key/value pair in the given tuple.
+  *
+  * The tuple's fields are set to this format's key and value objects; those objects are
+  * created once in {@code open} and passed to the record reader on every read.
+  *
+  * @param record the tuple to fill
+  * @return {@code record} with both fields set, or {@code null} if no further record exists
+  * @throws IOException if reading from the record reader fails
+  */
+ @Override
+ public Tuple2<K, V> nextRecord(Tuple2<K, V> record) throws IOException {
+     if (!fetched) {
+         fetchNext();
+     }
+     if (hasNext) {
+         record.f0 = key;
+         record.f1 = value;
+         // the buffered record has been handed out; the next call must read again
+         fetched = false;
+         return record;
+     }
+     return null;
+ }
+
+ /**
+  * Closes the record reader, if one has been opened.
+  *
+  * Calling this before {@code open} has assigned a reader is a no-op rather than a
+  * {@code NullPointerException}.
+  *
+  * @throws IOException if closing the record reader fails
+  */
+ @Override
+ public void close() throws IOException {
+     if (this.recordReader != null) {
+         this.recordReader.close();
+     }
+ }
+
+ // --------------------------------------------------------------------------------------------
+ // Helper methods
+ // --------------------------------------------------------------------------------------------
+
+ /**
+  * Computes file statistics for the given Hadoop paths.
+  *
+  * Each path is resolved to a file status. For a directory, its direct non-directory
+  * children are collected; sub-directories are not descended into. The cached statistics
+  * are returned unchanged if nothing is newer than their last modification time.
+  *
+  * @param cachedStats previously computed statistics, or {@code null}
+  * @param hadoopFilePaths the input paths to inspect
+  * @param files list that receives the status of every file found; it is appended to
+  * @return the cached statistics if still valid, otherwise freshly computed statistics whose
+  *         size is {@code BaseStatistics.SIZE_UNKNOWN} when the summed length is not positive
+  * @throws IOException if a file system or file status cannot be obtained
+  */
+ private FileBaseStatistics getFileStats(FileBaseStatistics cachedStats, org.apache.hadoop.fs.Path[] hadoopFilePaths,
+         ArrayList<FileStatus> files) throws IOException {
+
+     long newestModTime = 0L;
+
+     // collect all files and track the most recent modification time stamp
+     for (int i = 0; i < hadoopFilePaths.length; i++) {
+         final Path path = new Path(hadoopFilePaths[i].toUri());
+         final FileSystem fileSystem = FileSystem.get(path.toUri());
+
+         final FileStatus status = fileSystem.getFileStatus(path);
+         newestModTime = Math.max(newestModTime, status.getModificationTime());
+
+         if (!status.isDir()) {
+             files.add(status);
+             continue;
+         }
+
+         // a directory: take its direct children that are plain files
+         final FileStatus[] children = fileSystem.listStatus(path);
+         files.ensureCapacity(files.size() + children.length);
+         for (FileStatus child : children) {
+             if (child.isDir()) {
+                 continue;
+             }
+             files.add(child);
+             newestModTime = Math.max(child.getModificationTime(), newestModTime);
+         }
+     }
+
+     // reuse the cached statistics when nothing changed since they were taken
+     if (cachedStats != null && newestModTime <= cachedStats.getLastModificationTime()) {
+         return cachedStats;
+     }
+
+     // sum up the length of all files
+     long totalLength = 0;
+     for (FileStatus fileStatus : files) {
+         totalLength += fileStatus.getLen();
+     }
+
+     // a non-positive total is reported as unknown
+     if (totalLength <= 0) {
+         totalLength = BaseStatistics.SIZE_UNKNOWN;
+     }
+
+     return new FileBaseStatistics(newestModTime, totalLength, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
+ }
+
+ // --------------------------------------------------------------------------------------------
+ // Custom serialization methods
+ // --------------------------------------------------------------------------------------------
+
+ /**
+  * Custom serialization hook. Writes, in this order: the class name of the wrapped Hadoop
+  * input format, the key class name and the value class name (each as a UTF string),
+  * followed by the job configuration. Only the class name of the Hadoop input format is
+  * written, not the instance itself.
+  *
+  * @param out the stream to write to
+  * @throws IOException if writing to the stream fails
+  */
+ private void writeObject(ObjectOutputStream out) throws IOException {
+ out.writeUTF(mapredInputFormat.getClass().getName());
+ out.writeUTF(keyClass.getName());
+ out.writeUTF(valueClass.getName());
+ // the job configuration writes itself to the stream
+ jobConf.write(out);
+ }
+
+ /**
+  * Custom deserialization hook, the counterpart of {@code writeObject}.
+  *
+  * Reads the three class names and the job configuration, instantiates the Hadoop input
+  * format through its no-argument constructor, resolves the key and value classes with the
+  * thread's context class loader, and finally hands the job configuration to the input
+  * format via {@code ReflectionUtils.setConf}.
+  *
+  * @param in the stream to read from
+  * @throws IOException if reading from the stream fails
+  * @throws ClassNotFoundException declared for the serialization contract; class loading
+  *         failures in this method surface as {@code RuntimeException}
+  */
+ @SuppressWarnings("unchecked")
+ private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
+     // same order as written: input format class, key class, value class, job configuration
+     final String formatName = in.readUTF();
+     final String keyName = in.readUTF();
+     final String valueName = in.readUTF();
+
+     if (this.jobConf == null) {
+         this.jobConf = new JobConf();
+     }
+     this.jobConf.readFields(in);
+
+     final ClassLoader loader = Thread.currentThread().getContextClassLoader();
+
+     try {
+         this.mapredInputFormat = (org.apache.hadoop.mapred.InputFormat<K,V>) Class.forName(formatName, true, loader).newInstance();
+     } catch (Exception e) {
+         throw new RuntimeException("Unable to instantiate the hadoop input format", e);
+     }
+
+     try {
+         this.keyClass = (Class<K>) Class.forName(keyName, true, loader);
+     } catch (Exception e) {
+         throw new RuntimeException("Unable to find key class.", e);
+     }
+
+     try {
+         this.valueClass = (Class<V>) Class.forName(valueName, true, loader);
+     } catch (Exception e) {
+         throw new RuntimeException("Unable to find value class.", e);
+     }
+
+     ReflectionUtils.setConf(this.mapredInputFormat, this.jobConf);
+ }
+
+ // --------------------------------------------------------------------------------------------
+ // ResultTypeQueryable
+ // --------------------------------------------------------------------------------------------
+
+ /**
+  * @return the type information of the produced tuples: a two-field tuple type built from
+  *         writable type information for the key class and for the value class
+  */
+ @Override
+ public TypeInformation<Tuple2<K,V>> getProducedType() {
+     final WritableTypeInfo<K> keyInfo = new WritableTypeInfo<K>(keyClass);
+     final WritableTypeInfo<V> valueInfo = new WritableTypeInfo<V>(valueClass);
+     return new TupleTypeInfo<Tuple2<K,V>>(keyInfo, valueInfo);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/HadoopOutputFormat.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/HadoopOutputFormat.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/HadoopOutputFormat.java
new file mode 100644
index 0000000..849c701
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/HadoopOutputFormat.java
@@ -0,0 +1,164 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2014 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred;
+
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.FileOutputCommitter;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.JobContext;
+import org.apache.hadoop.mapred.JobID;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.TaskAttemptContext;
+import org.apache.hadoop.mapred.TaskAttemptID;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import eu.stratosphere.api.common.io.OutputFormat;
+import eu.stratosphere.api.java.tuple.Tuple2;
+import eu.stratosphere.configuration.Configuration;
+import eu.stratosphere.hadoopcompatibility.mapred.utils.HadoopUtils;
+import eu.stratosphere.hadoopcompatibility.mapred.wrapper.HadoopDummyProgressable;
+import eu.stratosphere.hadoopcompatibility.mapred.wrapper.HadoopDummyReporter;
+
+
+public class HadoopOutputFormat<K extends Writable,V extends Writable> implements OutputFormat<Tuple2<K, V>> {
+
+ private static final long serialVersionUID = 1L;
+
+ public JobConf jobConf;
+ public org.apache.hadoop.mapred.OutputFormat<K,V> mapredOutputFormat;
+ public transient RecordWriter<K,V> recordWriter;
+ public transient FileOutputCommitter fileOutputCommitter;
+ private transient TaskAttemptContext context;
+ private transient JobContext jobContext;
+
+ /**
+  * Creates a format that delegates to the given Hadoop (mapred API) output format.
+  *
+  * @param mapredOutputFormat the Hadoop output format to wrap
+  * @param job the job configuration; it is passed to {@code HadoopUtils.mergeHadoopConf} and then stored
+  */
+ public HadoopOutputFormat(org.apache.hadoop.mapred.OutputFormat<K,V> mapredOutputFormat, JobConf job) {
+     HadoopUtils.mergeHadoopConf(job);
+     this.jobConf = job;
+     this.mapredOutputFormat = mapredOutputFormat;
+ }
+
+ /**
+  * Replaces the job configuration held by this format. The configuration is stored as given.
+  *
+  * @param job the new job configuration
+  */
+ public void setJobConf(JobConf job) {
+     jobConf = job;
+ }
+
+ /**
+  * @return the job configuration held by this format (the stored reference, not a copy)
+  */
+ public JobConf getJobConf() {
+     return this.jobConf;
+ }
+
+ /**
+  * @return the wrapped Hadoop output format
+  */
+ public org.apache.hadoop.mapred.OutputFormat<K,V> getHadoopOutputFormat() {
+     return this.mapredOutputFormat;
+ }
+
+ /**
+  * Replaces the wrapped Hadoop output format.
+  *
+  * @param mapredOutputFormat the Hadoop output format to delegate to from now on
+  */
+ public void setHadoopOutputFormat(org.apache.hadoop.mapred.OutputFormat<K,V> mapredOutputFormat) {
+     final org.apache.hadoop.mapred.OutputFormat<K,V> replacement = mapredOutputFormat;
+     this.mapredOutputFormat = replacement;
+ }
+
+ // --------------------------------------------------------------------------------------------
+ // OutputFormat
+ // --------------------------------------------------------------------------------------------
+
+ /**
+  * Intentionally empty; the given parameters are not read.
+  *
+  * @param parameters ignored
+  */
+ @Override
+ public void configure(Configuration parameters) {
+     // no configuration step is required for this format
+ }
+
+ /**
+  * Prepares this format for writing.
+  *
+  * Builds a task attempt id from the one-based task number, zero-padded to six characters.
+  * It then instantiates the task attempt context and job context, records the attempt id in
+  * the job configuration, runs the job setup of a {@code FileOutputCommitter}, and obtains
+  * the Hadoop record writer.
+  *
+  * @param taskNumber The number of the parallel instance.
+  * @param numTasks The number of parallel tasks (not used by this method).
+  * @throws IOException if the task id has more than six characters, or if job setup or
+  *         creating the record writer fails
+  */
+ @Override
+ public void open(int taskNumber, int numTasks) throws IOException {
+     final String taskId = Integer.toString(taskNumber + 1);
+     if (taskId.length() > 6) {
+         throw new IOException("Task id too large.");
+     }
+
+     // Left-pad the id with zeros to six characters. Padding by hand avoids building a
+     // "%0s" format string for ids that already have six characters, which String.format
+     // rejects with an IllegalFormatException.
+     final StringBuilder attemptName = new StringBuilder("attempt__0000_r_");
+     for (int i = taskId.length(); i < 6; i++) {
+         attemptName.append('0');
+     }
+     attemptName.append(taskId).append("_0");
+
+     TaskAttemptID taskAttemptID = TaskAttemptID.forName(attemptName.toString());
+
+     try {
+         this.context = HadoopUtils.instantiateTaskAttemptContext(this.jobConf, taskAttemptID);
+     } catch (Exception e) {
+         throw new RuntimeException(e);
+     }
+
+     this.jobConf.set("mapred.task.id", taskAttemptID.toString());
+     // for hadoop 2.2
+     this.jobConf.set("mapreduce.task.attempt.id", taskAttemptID.toString());
+
+     this.fileOutputCommitter = new FileOutputCommitter();
+
+     try {
+         this.jobContext = HadoopUtils.instantiateJobContext(this.jobConf, new JobID());
+     } catch (Exception e) {
+         throw new RuntimeException(e);
+     }
+
+     this.fileOutputCommitter.setupJob(jobContext);
+
+     this.recordWriter = this.mapredOutputFormat.getRecordWriter(null, this.jobConf, taskId, new HadoopDummyProgressable());
+ }
+
+ /**
+  * Writes one key/value pair through the Hadoop record writer.
+  *
+  * @param record the tuple whose first field is the key and whose second field is the value
+  * @throws IOException if the record writer fails
+  */
+ @Override
+ public void writeRecord(Tuple2<K, V> record) throws IOException {
+     final K outKey = record.f0;
+     final V outValue = record.f1;
+     this.recordWriter.write(outKey, outValue);
+ }
+
+ /**
+  * Closes the record writer, commits the task if the output committer reports that a task
+  * commit is needed, and then commits the job.
+  *
+  * @throws IOException if closing the writer or one of the commits fails
+  */
+ @Override
+ public void close() throws IOException {
+     this.recordWriter.close(new HadoopDummyReporter());
+
+     final FileOutputCommitter committer = this.fileOutputCommitter;
+     final boolean taskCommitNeeded = committer.needsTaskCommit(this.context);
+     if (taskCommitNeeded) {
+         committer.commitTask(this.context);
+     }
+     committer.commitJob(this.jobContext);
+ }
+
+ // --------------------------------------------------------------------------------------------
+ // Custom serialization methods
+ // --------------------------------------------------------------------------------------------
+
+ /**
+  * Custom serialization hook. Writes the class name of the wrapped Hadoop output format as
+  * a UTF string, followed by the job configuration. Only the class name of the Hadoop
+  * output format is written, not the instance itself.
+  *
+  * @param out the stream to write to
+  * @throws IOException if writing to the stream fails
+  */
+ private void writeObject(ObjectOutputStream out) throws IOException {
+ out.writeUTF(mapredOutputFormat.getClass().getName());
+ // the job configuration writes itself to the stream
+ jobConf.write(out);
+ }
+
+ /**
+  * Custom deserialization hook, the counterpart of {@code writeObject}.
+  *
+  * Reads the output format class name and the job configuration, instantiates the Hadoop
+  * output format through its no-argument constructor using the thread's context class
+  * loader, and hands the job configuration to it via {@code ReflectionUtils.setConf}.
+  *
+  * @param in the stream to read from
+  * @throws IOException if reading from the stream fails
+  * @throws ClassNotFoundException declared for the serialization contract; instantiation
+  *         failures in this method surface as {@code RuntimeException}
+  */
+ @SuppressWarnings("unchecked")
+ private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
+     final String formatName = in.readUTF();
+
+     if (this.jobConf == null) {
+         this.jobConf = new JobConf();
+     }
+     this.jobConf.readFields(in);
+
+     final ClassLoader loader = Thread.currentThread().getContextClassLoader();
+     try {
+         this.mapredOutputFormat = (org.apache.hadoop.mapred.OutputFormat<K,V>) Class.forName(formatName, true, loader).newInstance();
+     } catch (Exception e) {
+         throw new RuntimeException("Unable to instantiate the hadoop output format", e);
+     }
+
+     ReflectionUtils.setConf(this.mapredOutputFormat, this.jobConf);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/example/WordCount.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/example/WordCount.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/example/WordCount.java
new file mode 100644
index 0000000..54160bf
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/example/WordCount.java
@@ -0,0 +1,115 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+package eu.stratosphere.hadoopcompatibility.mapred.example;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.TextInputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+
+import eu.stratosphere.api.java.DataSet;
+import eu.stratosphere.api.java.ExecutionEnvironment;
+import eu.stratosphere.api.java.aggregation.Aggregations;
+import eu.stratosphere.api.java.functions.FlatMapFunction;
+import eu.stratosphere.api.java.functions.MapFunction;
+import eu.stratosphere.api.java.tuple.Tuple2;
+import eu.stratosphere.hadoopcompatibility.mapred.HadoopInputFormat;
+import eu.stratosphere.hadoopcompatibility.mapred.HadoopOutputFormat;
+import eu.stratosphere.util.Collector;
+
+
+
+/**
+ * Implements a word count which takes the input file and counts the number of
+ * occurrences of each word in the file and writes the result back to disk.
+ *
+ * This example shows how to use Hadoop Input Formats, how to convert Hadoop Writables to
+ * common Java types for better usage in a Stratosphere job and how to use Hadoop Output Formats.
+ */
+@SuppressWarnings("serial")
+public class WordCount {
+
+ /**
+  * Runs the word count: reads lines through a Hadoop {@code TextInputFormat}, counts the
+  * words and writes the result through a Hadoop {@code TextOutputFormat}.
+  *
+  * @param args the input path followed by the result path; with fewer than two arguments a
+  *        usage message is printed and nothing is executed
+  * @throws Exception if setting up or executing the job fails
+  */
+ public static void main(String[] args) throws Exception {
+     if (args.length < 2) {
+         System.err.println("Usage: WordCount <input path> <result path>");
+         return;
+     }
+
+     final String inputPath = args[0];
+     final String outputPath = args[1];
+
+     final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
+     env.setDegreeOfParallelism(1);
+
+     // Hadoop input: text lines keyed by a LongWritable
+     final HadoopInputFormat<LongWritable, Text> source =
+         new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
+     TextInputFormat.addInputPath(source.getJobConf(), new Path(inputPath));
+
+     // read the lines as a data set
+     final DataSet<Tuple2<LongWritable, Text>> lines = env.createInput(source);
+
+     // split the lines into (word, 1) pairs, converting the Writable "Text" to String
+     final DataSet<Tuple2<String, Integer>> pairs = lines.flatMap(new Tokenizer());
+
+     // group by the word and sum the counts
+     final DataSet<Tuple2<String, Integer>> counts = pairs.groupBy(0).aggregate(Aggregations.SUM, 1);
+
+     // convert back to Writables for the Hadoop output format
+     final DataSet<Tuple2<Text, IntWritable>> writableCounts = counts.map(new HadoopDatatypeMapper());
+
+     // Hadoop output: text lines with a single space between key and value
+     final HadoopOutputFormat<Text, IntWritable> sink =
+         new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), new JobConf());
+     sink.getJobConf().set("mapred.textoutputformat.separator", " ");
+     TextOutputFormat.setOutputPath(sink.getJobConf(), new Path(outputPath));
+
+     // attach the sink and run the job
+     writableCounts.output(sink);
+     env.execute("Word Count");
+ }
+
+ /**
+  * Splits each input line into lower-cased words and emits a {@code (word, 1)} pair per
+  * word, using plain Java types instead of Hadoop Writables.
+  */
+ public static final class Tokenizer extends FlatMapFunction<Tuple2<LongWritable, Text>, Tuple2<String, Integer>> {
+
+     /**
+      * @param value the input pair; only the text in the second field is used
+      * @param out receives one {@code (word, 1)} tuple per non-empty token
+      */
+     @Override
+     public void flatMap(Tuple2<LongWritable, Text> value, Collector<Tuple2<String, Integer>> out) {
+         // lower-case the line and split it on runs of non-word characters
+         final String[] words = value.f1.toString().toLowerCase().split("\\W+");
+
+         for (int i = 0; i < words.length; i++) {
+             final String word = words[i];
+             // a leading separator yields an empty first token; skip it
+             if (word.length() == 0) {
+                 continue;
+             }
+             out.collect(new Tuple2<String, Integer>(word, 1));
+         }
+     }
+ }
+
+ /**
+  * Converts a {@code (String, Integer)} pair into the corresponding Hadoop Writables,
+  * {@code (Text, IntWritable)}.
+  */
+ public static final class HadoopDatatypeMapper extends MapFunction<Tuple2<String, Integer>, Tuple2<Text, IntWritable>> {
+
+     /**
+      * @param value the pair to convert
+      * @return a new tuple holding a new {@code Text} and a new {@code IntWritable}
+      */
+     @Override
+     public Tuple2<Text, IntWritable> map(Tuple2<String, Integer> value) throws Exception {
+         final Text word = new Text(value.f0);
+         final IntWritable count = new IntWritable(value.f1);
+         return new Tuple2<Text, IntWritable>(word, count);
+     }
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopDataSink.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopDataSink.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopDataSink.java
new file mode 100644
index 0000000..2d0e052
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopDataSink.java
@@ -0,0 +1,102 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred.record;
+
+import java.util.List;
+
+import eu.stratosphere.types.Record;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputFormat;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import eu.stratosphere.api.java.record.operators.GenericDataSink;
+import eu.stratosphere.api.common.operators.Operator;
+import eu.stratosphere.compiler.contextcheck.Validatable;
+import eu.stratosphere.hadoopcompatibility.mapred.record.datatypes.DefaultStratosphereTypeConverter;
+import eu.stratosphere.hadoopcompatibility.mapred.record.datatypes.StratosphereTypeConverter;
+
+/**
+ * The HadoopDataSink is a generic wrapper for all Hadoop OutputFormats.
+ *
+ * Example usage:
+ * <pre>{@code
+ * HadoopDataSink out = new HadoopDataSink(new org.apache.hadoop.mapred.TextOutputFormat<Text, IntWritable>(), new JobConf(), "Hadoop TextOutputFormat", reducer, Text.class, IntWritable.class);
+ * org.apache.hadoop.mapred.TextOutputFormat.setOutputPath(out.getJobConf(), new Path(output));
+ * }</pre>
+ *
+ * Note that it is possible to provide a custom data type converter.
+ *
+ * The HadoopDataSink provides a default converter: {@link eu.stratosphere.hadoopcompatibility.mapred.record.datatypes.DefaultStratosphereTypeConverter}
+ **/
+public class HadoopDataSink<K,V> extends GenericDataSink implements Validatable {
+
+ /** Name used by the constructors that do not take a name. */
+ private static final String DEFAULT_NAME = "<Unnamed Hadoop Data Sink>";
+
+ /** The job configuration that is also handed to the wrapped record output format. */
+ private JobConf jobConf;
+
+ /** Single input, explicit converter; wraps the input in a list and delegates to the list-based constructor. */
+ public HadoopDataSink(OutputFormat<K,V> hadoopFormat, JobConf jobConf, String name, Operator<Record> input, StratosphereTypeConverter<K,V> conv, Class<K> keyClass, Class<V> valueClass) {
+     this(hadoopFormat, jobConf, name, ImmutableList.<Operator<Record>>of(input), conv, keyClass, valueClass);
+ }
+
+ /** Single input, default converter. */
+ public HadoopDataSink(OutputFormat<K,V> hadoopFormat, JobConf jobConf, String name, Operator<Record> input, Class<K> keyClass, Class<V> valueClass) {
+     this(hadoopFormat, jobConf, name, input, new DefaultStratosphereTypeConverter<K, V>(keyClass, valueClass), keyClass, valueClass);
+ }
+
+ /** Single input, default converter, default name. */
+ public HadoopDataSink(OutputFormat<K,V> hadoopFormat, JobConf jobConf, Operator<Record> input, Class<K> keyClass, Class<V> valueClass) {
+     this(hadoopFormat, jobConf, DEFAULT_NAME, input, new DefaultStratosphereTypeConverter<K, V>(keyClass, valueClass), keyClass, valueClass);
+ }
+
+ /** Single input, default converter, default name, fresh {@code JobConf}. */
+ public HadoopDataSink(OutputFormat<K,V> hadoopFormat, Operator<Record> input, Class<K> keyClass, Class<V> valueClass) {
+     this(hadoopFormat, new JobConf(), DEFAULT_NAME, input, new DefaultStratosphereTypeConverter<K, V>(keyClass, valueClass), keyClass, valueClass);
+ }
+
+ /**
+  * Primary constructor; all other constructors end up here.
+  *
+  * @param hadoopFormat the Hadoop output format to wrap; must not be {@code null}
+  * @param jobConf the job configuration; must not be {@code null}. Its output key and
+  *        value classes are set to {@code keyClass} and {@code valueClass}.
+  * @param name the name of this sink
+  * @param input the operators feeding this sink
+  * @param conv the converter handed to the wrapped record output format
+  * @param keyClass the Hadoop output key class
+  * @param valueClass the Hadoop output value class
+  * @throws NullPointerException if {@code hadoopFormat} or {@code jobConf} is {@code null}
+  */
+ @SuppressWarnings("deprecation")
+ public HadoopDataSink(OutputFormat<K,V> hadoopFormat, JobConf jobConf, String name, List<Operator<Record>> input, StratosphereTypeConverter<K,V> conv, Class<K> keyClass, Class<V> valueClass) {
+     // checkNotNull returns its argument, so the checks run before the values are used
+     super(new HadoopRecordOutputFormat<K,V>(Preconditions.checkNotNull(hadoopFormat), Preconditions.checkNotNull(jobConf), conv), input, name);
+     this.name = name;
+     this.jobConf = jobConf;
+     jobConf.setOutputKeyClass(keyClass);
+     jobConf.setOutputValueClass(valueClass);
+ }
+
+ /** Multiple inputs, default converter. */
+ public HadoopDataSink(OutputFormat<K,V> hadoopFormat, JobConf jobConf, String name, List<Operator<Record>> input, Class<K> keyClass, Class<V> valueClass) {
+     this(hadoopFormat, jobConf, name, input, new DefaultStratosphereTypeConverter<K, V>(keyClass, valueClass), keyClass, valueClass);
+ }
+
+ /** Multiple inputs, default converter, default name. */
+ public HadoopDataSink(OutputFormat<K,V> hadoopFormat, JobConf jobConf, List<Operator<Record>> input, Class<K> keyClass, Class<V> valueClass) {
+     this(hadoopFormat, jobConf, DEFAULT_NAME, input, new DefaultStratosphereTypeConverter<K, V>(keyClass, valueClass), keyClass, valueClass);
+ }
+
+ /** Multiple inputs, default converter, default name, fresh {@code JobConf}. */
+ public HadoopDataSink(OutputFormat<K,V> hadoopFormat, List<Operator<Record>> input, Class<K> keyClass, Class<V> valueClass) {
+     this(hadoopFormat, new JobConf(), DEFAULT_NAME, input, new DefaultStratosphereTypeConverter<K, V>(keyClass, valueClass), keyClass, valueClass);
+ }
+
+ /**
+  * @return the job configuration given at construction time (the stored reference, not a copy)
+  */
+ public JobConf getJobConf() {
+     return this.jobConf;
+ }
+
+ /**
+  * Verifies that an output path has been set on the job configuration.
+  *
+  * @throws NullPointerException if {@code FileOutputFormat.getOutputPath} returns {@code null}
+  */
+ @Override
+ public void check() {
+     // see for more details https://github.com/stratosphere/stratosphere/pull/531
+     Preconditions.checkNotNull(FileOutputFormat.getOutputPath(jobConf), "The HadoopDataSink currently expects a correct outputPath.");
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopDataSource.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopDataSource.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopDataSource.java
new file mode 100644
index 0000000..f8f2120
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopDataSource.java
@@ -0,0 +1,81 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2014 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred.record;
+
+
+import org.apache.hadoop.mapred.InputFormat;
+import org.apache.hadoop.mapred.JobConf;
+
+import com.google.common.base.Preconditions;
+
+import eu.stratosphere.api.java.record.operators.GenericDataSource;
+import eu.stratosphere.hadoopcompatibility.mapred.record.datatypes.DefaultHadoopTypeConverter;
+import eu.stratosphere.hadoopcompatibility.mapred.record.datatypes.HadoopTypeConverter;
+
+
+
+/**
+ * The HadoopDataSource is a generic wrapper for all Hadoop InputFormats.
+ *
+ * Example usage:
+ * <pre>
+ * HadoopDataSource&lt;LongWritable, Text&gt; source = new HadoopDataSource&lt;LongWritable, Text&gt;(new org.apache.hadoop.mapred.TextInputFormat(), new JobConf(), "Input Lines");
+ * org.apache.hadoop.mapred.TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));
+ * </pre>
+ *
+ * Note that it is possible to provide a custom data type converter.
+ *
+ * The HadoopDataSource provides two different standard converters:
+ * * WritableWrapperConverter: Converts Hadoop Types to a record that contains a WritableComparableWrapper (key) and a WritableWrapper (value)
+ * * DefaultHadoopTypeConverter: Converts the standard hadoop types (LongWritable, Text) to Stratosphere's standard types.
+ *
+ */
+public class HadoopDataSource<K,V> extends GenericDataSource<HadoopRecordInputFormat<K,V>> {
+
+    private static final String DEFAULT_NAME = "<Unnamed Hadoop Data Source>";
+
+    private final JobConf jobConf;
+
+    /**
+     * Creates a data source reading from the given Hadoop input format.
+     *
+     * @param hadoopFormat Implementation of a Hadoop input format
+     * @param jobConf JobConf object (Hadoop)
+     * @param name Name of the DataSource
+     * @param conv Definition of a custom type converter {@link DefaultHadoopTypeConverter}.
+     * @throws NullPointerException if {@code hadoopFormat}, {@code jobConf} or {@code conv} is {@code null}
+     */
+    public HadoopDataSource(InputFormat<K,V> hadoopFormat, JobConf jobConf, String name, HadoopTypeConverter<K,V> conv) {
+        // Arguments are validated inside the helper, i.e. before the wrapped input format touches them.
+        super(createInputFormat(hadoopFormat, jobConf, conv), name);
+        this.name = name;
+        this.jobConf = jobConf;
+    }
+
+    /**
+     * Creates a data source that uses a {@link DefaultHadoopTypeConverter}.
+     *
+     * @param hadoopFormat Implementation of a Hadoop input format
+     * @param jobConf JobConf object (Hadoop)
+     * @param name Name of the DataSource
+     */
+    public HadoopDataSource(InputFormat<K,V> hadoopFormat, JobConf jobConf, String name) {
+        this(hadoopFormat, jobConf, name, new DefaultHadoopTypeConverter<K,V>());
+    }
+
+    /**
+     * Creates a data source with the default name and a {@link DefaultHadoopTypeConverter}.
+     *
+     * @param hadoopFormat Implementation of a Hadoop input format
+     * @param jobConf JobConf object (Hadoop)
+     */
+    public HadoopDataSource(InputFormat<K,V> hadoopFormat, JobConf jobConf) {
+        this(hadoopFormat, jobConf, DEFAULT_NAME);
+    }
+
+    /**
+     * Creates a data source with a new {@link JobConf}, the default name and a {@link DefaultHadoopTypeConverter}.
+     *
+     * @param hadoopFormat Implementation of a Hadoop input format
+     */
+    public HadoopDataSource(InputFormat<K,V> hadoopFormat) {
+        this(hadoopFormat, new JobConf(), DEFAULT_NAME);
+    }
+
+    /**
+     * Returns the Hadoop job configuration passed to (or created by) the constructor.
+     *
+     * @return the job configuration of this source
+     */
+    public JobConf getJobConf() {
+        return this.jobConf;
+    }
+
+    /**
+     * Validates the constructor arguments and wraps them into the record input format.
+     * Runs as part of the super-constructor call so that null arguments are reported
+     * with a descriptive message instead of failing somewhere inside the wrapper.
+     */
+    private static <K,V> HadoopRecordInputFormat<K,V> createInputFormat(InputFormat<K,V> hadoopFormat, JobConf jobConf, HadoopTypeConverter<K,V> conv) {
+        Preconditions.checkNotNull(hadoopFormat, "The Hadoop input format must not be null.");
+        Preconditions.checkNotNull(jobConf, "The JobConf must not be null.");
+        Preconditions.checkNotNull(conv, "The type converter must not be null.");
+        return new HadoopRecordInputFormat<K,V>(hadoopFormat, jobConf, conv);
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopRecordInputFormat.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopRecordInputFormat.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopRecordInputFormat.java
new file mode 100644
index 0000000..7c47aa8
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopRecordInputFormat.java
@@ -0,0 +1,167 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred.record;
+
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import eu.stratosphere.api.common.io.InputFormat;
+import eu.stratosphere.api.common.io.statistics.BaseStatistics;
+import eu.stratosphere.configuration.Configuration;
+import eu.stratosphere.hadoopcompatibility.mapred.record.datatypes.HadoopTypeConverter;
+import eu.stratosphere.hadoopcompatibility.mapred.utils.HadoopUtils;
+import eu.stratosphere.hadoopcompatibility.mapred.wrapper.HadoopDummyReporter;
+import eu.stratosphere.hadoopcompatibility.mapred.wrapper.HadoopInputSplit;
+import eu.stratosphere.types.Record;
+
+/**
+ * Stratosphere {@link InputFormat} that reads key/value pairs from a wrapped Hadoop (mapred)
+ * input format and turns each pair into a {@link Record} through a {@link HadoopTypeConverter}.
+ *
+ * The instance is serialized with custom {@code writeObject}/{@code readObject} methods: only the
+ * class name of the Hadoop input format, the {@link JobConf} and the converter are written. The
+ * Hadoop input format itself is re-created reflectively on deserialization.
+ */
+public class HadoopRecordInputFormat<K, V> implements InputFormat<Record, HadoopInputSplit> {
+
+    private static final long serialVersionUID = 1L;
+
+    public org.apache.hadoop.mapred.InputFormat<K, V> hadoopInputFormat;
+    public HadoopTypeConverter<K,V> converter;
+    private String hadoopInputFormatName;
+    public JobConf jobConf;
+    public transient K key;
+    public transient V value;
+    public RecordReader<K, V> recordReader;
+    // true while a pair fetched by fetchNext() has not yet been handed out by nextRecord()
+    private boolean fetched = false;
+    // result of the most recent recordReader.next(key, value) call
+    private boolean hasNext;
+
+    public HadoopRecordInputFormat() {
+        super();
+    }
+
+    /**
+     * @param hadoopInputFormat Hadoop input format to read from; its class name is remembered for serialization
+     * @param job Hadoop job configuration; passed through {@code HadoopUtils.mergeHadoopConf} before being stored
+     * @param conv converter applied to every key/value pair
+     */
+    public HadoopRecordInputFormat(org.apache.hadoop.mapred.InputFormat<K,V> hadoopInputFormat, JobConf job, HadoopTypeConverter<K,V> conv) {
+        super();
+        this.hadoopInputFormat = hadoopInputFormat;
+        this.hadoopInputFormatName = hadoopInputFormat.getClass().getName();
+        this.converter = conv;
+        HadoopUtils.mergeHadoopConf(job);
+        this.jobConf = job;
+    }
+
+    /** Nothing to configure; all settings travel in the {@link JobConf}. */
+    @Override
+    public void configure(Configuration parameters) {
+
+    }
+
+    /** No statistics are provided; always returns {@code null}. */
+    @Override
+    public BaseStatistics getStatistics(BaseStatistics cachedStatistics) throws IOException {
+        return null;
+    }
+
+    /**
+     * Asks the Hadoop input format for its splits and wraps each one into a {@link HadoopInputSplit}.
+     */
+    @Override
+    public HadoopInputSplit[] createInputSplits(int minNumSplits)
+            throws IOException {
+        org.apache.hadoop.mapred.InputSplit[] splitArray = hadoopInputFormat.getSplits(jobConf, minNumSplits);
+        HadoopInputSplit[] hiSplit = new HadoopInputSplit[splitArray.length];
+        for (int i = 0; i < splitArray.length; i++) {
+            hiSplit[i] = new HadoopInputSplit(splitArray[i], jobConf);
+        }
+        return hiSplit;
+    }
+
+    @Override
+    public Class<? extends HadoopInputSplit> getInputSplitType() {
+        return HadoopInputSplit.class;
+    }
+
+    /**
+     * Obtains a record reader for the given split and creates the reusable key and value objects.
+     */
+    @Override
+    public void open(HadoopInputSplit split) throws IOException {
+        this.recordReader = this.hadoopInputFormat.getRecordReader(split.getHadoopInputSplit(), jobConf, new HadoopDummyReporter());
+        key = this.recordReader.createKey();
+        value = this.recordReader.createValue();
+        this.fetched = false;
+    }
+
+    /** Reads the next pair into {@code key}/{@code value} and remembers whether one was available. */
+    private void fetchNext() throws IOException {
+        hasNext = this.recordReader.next(key, value);
+        fetched = true;
+    }
+
+    /**
+     * Looks ahead by one pair if necessary and reports whether the reader is exhausted.
+     */
+    @Override
+    public boolean reachedEnd() throws IOException {
+        if (!fetched) {
+            fetchNext();
+        }
+        return !hasNext;
+    }
+
+    /**
+     * Converts the current key/value pair into the given record.
+     *
+     * @return the filled record, or {@code null} if the reader has no further pair
+     */
+    @Override
+    public Record nextRecord(Record record) throws IOException {
+        if (!fetched) {
+            fetchNext();
+        }
+        if (!hasNext) {
+            return null;
+        }
+        converter.convert(record, key, value);
+        // the looked-ahead pair has been consumed; force a new fetch on the next call
+        fetched = false;
+        return record;
+    }
+
+    /**
+     * Closes the record reader, if one was opened.
+     */
+    @Override
+    public void close() throws IOException {
+        // open() may have failed or never been called; a NullPointerException here
+        // would hide the original error from the caller's cleanup path.
+        if (this.recordReader != null) {
+            this.recordReader.close();
+        }
+    }
+
+    /**
+     * Custom serialization methods.
+     * @see http://docs.oracle.com/javase/7/docs/api/java/io/Serializable.html
+     */
+    private void writeObject(ObjectOutputStream out) throws IOException {
+        out.writeUTF(hadoopInputFormatName);
+        jobConf.write(out);
+        out.writeObject(converter);
+    }
+
+    /**
+     * Restores the state written by {@code writeObject} and re-instantiates the Hadoop input
+     * format from its class name via its no-argument constructor.
+     */
+    @SuppressWarnings("unchecked")
+    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
+        hadoopInputFormatName = in.readUTF();
+        if (jobConf == null) {
+            jobConf = new JobConf();
+        }
+        jobConf.readFields(in);
+        try {
+            this.hadoopInputFormat = (org.apache.hadoop.mapred.InputFormat<K,V>) Class.forName(this.hadoopInputFormatName).newInstance();
+        } catch (Exception e) {
+            throw new RuntimeException("Unable to instantiate the hadoop input format", e);
+        }
+        ReflectionUtils.setConf(hadoopInputFormat, jobConf);
+        converter = (HadoopTypeConverter<K,V>) in.readObject();
+    }
+
+    public void setJobConf(JobConf job) {
+        this.jobConf = job;
+    }
+
+
+    public org.apache.hadoop.mapred.InputFormat<K,V> getHadoopInputFormat() {
+        return hadoopInputFormat;
+    }
+
+    public void setHadoopInputFormat(org.apache.hadoop.mapred.InputFormat<K,V> hadoopInputFormat) {
+        this.hadoopInputFormat = hadoopInputFormat;
+    }
+
+    public JobConf getJobConf() {
+        return jobConf;
+    }
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopRecordOutputFormat.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopRecordOutputFormat.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopRecordOutputFormat.java
new file mode 100644
index 0000000..edd02a1
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/HadoopRecordOutputFormat.java
@@ -0,0 +1,151 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2014 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred.record;
+
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.TaskAttemptID;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import eu.stratosphere.api.common.io.OutputFormat;
+import eu.stratosphere.configuration.Configuration;
+import eu.stratosphere.hadoopcompatibility.mapred.record.datatypes.HadoopFileOutputCommitter;
+import eu.stratosphere.hadoopcompatibility.mapred.record.datatypes.StratosphereTypeConverter;
+import eu.stratosphere.hadoopcompatibility.mapred.utils.HadoopUtils;
+import eu.stratosphere.hadoopcompatibility.mapred.wrapper.HadoopDummyProgressable;
+import eu.stratosphere.hadoopcompatibility.mapred.wrapper.HadoopDummyReporter;
+import eu.stratosphere.types.Record;
+
+
+public class HadoopRecordOutputFormat<K,V> implements OutputFormat<Record> {
+
+ private static final long serialVersionUID = 1L;
+
+ public JobConf jobConf;
+
+ public org.apache.hadoop.mapred.OutputFormat<K,V> hadoopOutputFormat;
+
+ private String hadoopOutputFormatName;
+
+ public RecordWriter<K,V> recordWriter;
+
+ public StratosphereTypeConverter<K,V> converter;
+
+ public HadoopFileOutputCommitter fileOutputCommitterWrapper;
+
+ public HadoopRecordOutputFormat(org.apache.hadoop.mapred.OutputFormat<K,V> hadoopFormat, JobConf job, StratosphereTypeConverter<K,V> conv) {
+ super();
+ this.hadoopOutputFormat = hadoopFormat;
+ this.hadoopOutputFormatName = hadoopFormat.getClass().getName();
+ this.converter = conv;
+ this.fileOutputCommitterWrapper = new HadoopFileOutputCommitter();
+ HadoopUtils.mergeHadoopConf(job);
+ this.jobConf = job;
+ }
+
+ @Override
+ public void configure(Configuration parameters) {
+ }
+
+ /**
+ * create the temporary output file for hadoop RecordWriter.
+ * @param taskNumber The number of the parallel instance.
+ * @param numTasks The number of parallel tasks.
+ * @throws IOException
+ */
+ @Override
+ public void open(int taskNumber, int numTasks) throws IOException {
+ this.fileOutputCommitterWrapper.setupJob(this.jobConf);
+ if (Integer.toString(taskNumber + 1).length() <= 6) {
+ this.jobConf.set("mapred.task.id", "attempt__0000_r_" + String.format("%" + (6 - Integer.toString(taskNumber + 1).length()) + "s"," ").replace(" ", "0") + Integer.toString(taskNumber + 1) + "_0");
+ //compatible for hadoop 2.2.0, the temporary output directory is different from hadoop 1.2.1
+ this.jobConf.set("mapreduce.task.output.dir", this.fileOutputCommitterWrapper.getTempTaskOutputPath(this.jobConf,TaskAttemptID.forName(this.jobConf.get("mapred.task.id"))).toString());
+ } else {
+ throw new IOException("task id too large");
+ }
+ this.recordWriter = this.hadoopOutputFormat.getRecordWriter(null, this.jobConf, Integer.toString(taskNumber + 1), new HadoopDummyProgressable());
+ }
+
+
+ @Override
+ public void writeRecord(Record record) throws IOException {
+ K key = this.converter.convertKey(record);
+ V value = this.converter.convertValue(record);
+ this.recordWriter.write(key, value);
+ }
+
+ /**
+ * commit the task by moving the output file out from the temporary directory.
+ * @throws IOException
+ */
+ @Override
+ public void close() throws IOException {
+ this.recordWriter.close(new HadoopDummyReporter());
+ if (this.fileOutputCommitterWrapper.needsTaskCommit(this.jobConf, TaskAttemptID.forName(this.jobConf.get("mapred.task.id")))) {
+ this.fileOutputCommitterWrapper.commitTask(this.jobConf, TaskAttemptID.forName(this.jobConf.get("mapred.task.id")));
+ }
+ //TODO: commitjob when all the tasks are finished
+ }
+
+
+ /**
+ * Custom serialization methods.
+ * @see http://docs.oracle.com/javase/7/docs/api/java/io/Serializable.html
+ */
+ private void writeObject(ObjectOutputStream out) throws IOException {
+ out.writeUTF(hadoopOutputFormatName);
+ jobConf.write(out);
+ out.writeObject(converter);
+ out.writeObject(fileOutputCommitterWrapper);
+ }
+
+ @SuppressWarnings("unchecked")
+ private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
+ hadoopOutputFormatName = in.readUTF();
+ if(jobConf == null) {
+ jobConf = new JobConf();
+ }
+ jobConf.readFields(in);
+ try {
+ this.hadoopOutputFormat = (org.apache.hadoop.mapred.OutputFormat<K,V>) Class.forName(this.hadoopOutputFormatName).newInstance();
+ } catch (Exception e) {
+ throw new RuntimeException("Unable to instantiate the hadoop output format", e);
+ }
+ ReflectionUtils.setConf(hadoopOutputFormat, jobConf);
+ converter = (StratosphereTypeConverter<K,V>) in.readObject();
+ fileOutputCommitterWrapper = (HadoopFileOutputCommitter) in.readObject();
+ }
+
+
+ public void setJobConf(JobConf job) {
+ this.jobConf = job;
+ }
+
+ public JobConf getJobConf() {
+ return jobConf;
+ }
+
+ public org.apache.hadoop.mapred.OutputFormat<K,V> getHadoopOutputFormat() {
+ return hadoopOutputFormat;
+ }
+
+ public void setHadoopOutputFormat(org.apache.hadoop.mapred.OutputFormat<K,V> hadoopOutputFormat) {
+ this.hadoopOutputFormat = hadoopOutputFormat;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/DefaultHadoopTypeConverter.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/DefaultHadoopTypeConverter.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/DefaultHadoopTypeConverter.java
new file mode 100644
index 0000000..3832772
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/DefaultHadoopTypeConverter.java
@@ -0,0 +1,78 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred.record.datatypes;
+
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.ByteWritable;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+
+import eu.stratosphere.types.BooleanValue;
+import eu.stratosphere.types.ByteValue;
+import eu.stratosphere.types.DoubleValue;
+import eu.stratosphere.types.FloatValue;
+import eu.stratosphere.types.IntValue;
+import eu.stratosphere.types.LongValue;
+import eu.stratosphere.types.NullValue;
+import eu.stratosphere.types.Record;
+import eu.stratosphere.types.StringValue;
+import eu.stratosphere.types.Value;
+
+
+/**
+ * Converter for the default hadoop writables.
+ * The converted key is stored in field 0 and the converted value in field 1 of a Stratosphere Record.
+ */
+public class DefaultHadoopTypeConverter<K, V> implements HadoopTypeConverter<K, V> {
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Writes the converted key into field 0 and the converted value into field 1 of the record.
+     */
+    @Override
+    public void convert(Record stratosphereRecord, K hadoopKey, V hadoopValue) {
+        final Value keyField = convert(hadoopKey);
+        stratosphereRecord.setField(0, keyField);
+        final Value valueField = convert(hadoopValue);
+        stratosphereRecord.setField(1, valueField);
+    }
+
+    /**
+     * Maps a single Hadoop writable onto the matching Stratosphere value type.
+     *
+     * @param hadoopType the Hadoop object to convert
+     * @return a Stratosphere value holding the same content
+     * @throws RuntimeException if the object is none of the supported writable types
+     */
+    protected Value convert(Object hadoopType) {
+        if (hadoopType instanceof LongWritable) {
+            return new LongValue(((LongWritable) hadoopType).get());
+        } else if (hadoopType instanceof Text) {
+            return new StringValue(((Text) hadoopType).toString());
+        } else if (hadoopType instanceof IntWritable) {
+            return new IntValue(((IntWritable) hadoopType).get());
+        } else if (hadoopType instanceof FloatWritable) {
+            return new FloatValue(((FloatWritable) hadoopType).get());
+        } else if (hadoopType instanceof DoubleWritable) {
+            return new DoubleValue(((DoubleWritable) hadoopType).get());
+        } else if (hadoopType instanceof BooleanWritable) {
+            return new BooleanValue(((BooleanWritable) hadoopType).get());
+        } else if (hadoopType instanceof ByteWritable) {
+            return new ByteValue(((ByteWritable) hadoopType).get());
+        } else if (hadoopType instanceof NullWritable) {
+            return NullValue.getInstance();
+        }
+
+        final String unsupported = hadoopType.getClass().getCanonicalName();
+        throw new RuntimeException("Unable to convert Hadoop type ("+unsupported+") to Stratosphere.");
+    }
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/DefaultStratosphereTypeConverter.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/DefaultStratosphereTypeConverter.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/DefaultStratosphereTypeConverter.java
new file mode 100644
index 0000000..a1850cc
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/DefaultStratosphereTypeConverter.java
@@ -0,0 +1,91 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred.record.datatypes;
+
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.ByteWritable;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+
+import eu.stratosphere.types.BooleanValue;
+import eu.stratosphere.types.ByteValue;
+import eu.stratosphere.types.DoubleValue;
+import eu.stratosphere.types.FloatValue;
+import eu.stratosphere.types.IntValue;
+import eu.stratosphere.types.LongValue;
+import eu.stratosphere.types.Record;
+import eu.stratosphere.types.StringValue;
+
+/**
+ * Converts the fields of a Stratosphere Record into the default hadoop writables.
+ * The key is read from field 0 and the value from field 1 of the record.
+ */
+public class DefaultStratosphereTypeConverter<K,V> implements StratosphereTypeConverter<K,V> {
+    private static final long serialVersionUID = 1L;
+
+    private final Class<K> keyClass;
+    private final Class<V> valueClass;
+
+    /**
+     * @param keyClass Hadoop writable class to produce for the key (record field 0)
+     * @param valueClass Hadoop writable class to produce for the value (record field 1)
+     */
+    public DefaultStratosphereTypeConverter(Class<K> keyClass, Class<V> valueClass) {
+        this.keyClass = keyClass;
+        this.valueClass = valueClass;
+    }
+
+    /**
+     * Converts field 0 of the record into the key class.
+     *
+     * @return the Hadoop key, or {@code null} if the record has no fields
+     */
+    @Override
+    public K convertKey(Record stratosphereRecord) {
+        if (stratosphereRecord.getNumFields() > 0) {
+            return convert(stratosphereRecord, 0, this.keyClass);
+        } else {
+            return null;
+        }
+    }
+
+    /**
+     * Converts field 1 of the record into the value class.
+     *
+     * @return the Hadoop value, or {@code null} if the record has fewer than two fields
+     */
+    @Override
+    public V convertValue(Record stratosphereRecord) {
+        if (stratosphereRecord.getNumFields() > 1) {
+            return convert(stratosphereRecord, 1, this.valueClass);
+        } else {
+            return null;
+        }
+    }
+
+    /**
+     * Reads the field at {@code pos} as the Stratosphere type that corresponds to
+     * {@code hadoopType} and wraps its content into a new Hadoop writable.
+     *
+     * @throws RuntimeException if {@code hadoopType} is none of the supported writable classes
+     */
+    @SuppressWarnings("unchecked")
+    private <T> T convert(Record stratosphereType, int pos, Class<T> hadoopType) {
+        if (hadoopType == LongWritable.class) {
+            return (T) new LongWritable((stratosphereType.getField(pos, LongValue.class)).getValue());
+        }
+        if (hadoopType == Text.class) {
+            return (T) new Text((stratosphereType.getField(pos, StringValue.class)).getValue());
+        }
+        if (hadoopType == IntWritable.class) {
+            return (T) new IntWritable((stratosphereType.getField(pos, IntValue.class)).getValue());
+        }
+        if (hadoopType == FloatWritable.class) {
+            return (T) new FloatWritable((stratosphereType.getField(pos, FloatValue.class)).getValue());
+        }
+        if (hadoopType == DoubleWritable.class) {
+            return (T) new DoubleWritable((stratosphereType.getField(pos, DoubleValue.class)).getValue());
+        }
+        if (hadoopType == BooleanWritable.class) {
+            return (T) new BooleanWritable((stratosphereType.getField(pos, BooleanValue.class)).getValue());
+        }
+        if (hadoopType == ByteWritable.class) {
+            return (T) new ByteWritable((stratosphereType.getField(pos, ByteValue.class)).getValue());
+        }
+
+        // Report the requested Hadoop target type: the class of the record argument is
+        // always Record and says nothing about what could not be converted.
+        final String target = (hadoopType == null) ? "null" : hadoopType.getName();
+        throw new RuntimeException("Unable to convert Stratosphere record field " + pos + " to Hadoop type (" + target + ").");
+    }
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/HadoopFileOutputCommitter.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/HadoopFileOutputCommitter.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/HadoopFileOutputCommitter.java
new file mode 100644
index 0000000..8f46c00
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/HadoopFileOutputCommitter.java
@@ -0,0 +1,191 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred.record.datatypes;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.net.URI;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.FileOutputCommitter;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.TaskAttemptID;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * Hadoop 1.2.1 {@link org.apache.hadoop.mapred.FileOutputCommitter} takes {@link org.apache.hadoop.mapred.JobContext}
+ * as input parameter. However JobContext class is package private, and in Hadoop 2.2.0 it's public.
+ * This class takes {@link org.apache.hadoop.mapred.JobConf} as input instead of JobContext in order to setup and commit tasks.
+ */
+public class HadoopFileOutputCommitter extends FileOutputCommitter implements Serializable {
+
+ private static final long serialVersionUID = 1L;
+
+ static final String SUCCESSFUL_JOB_OUTPUT_DIR_MARKER =
+ "mapreduce.fileoutputcommitter.marksuccessfuljobs";
+
+ public void setupJob(JobConf conf) throws IOException {
+ Path outputPath = FileOutputFormat.getOutputPath(conf);
+ if (outputPath != null) {
+ Path tmpDir = new Path(outputPath, FileOutputCommitter.TEMP_DIR_NAME);
+ FileSystem fileSys = tmpDir.getFileSystem(conf);
+ if (!fileSys.mkdirs(tmpDir)) {
+ LOG.error("Mkdirs failed to create " + tmpDir.toString());
+ }
+ }
+ }
+
+ private static boolean getOutputDirMarking(JobConf conf) {
+ return conf.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER,
+ true);
+ }
+
+ private void markSuccessfulOutputDir(JobConf conf)
+ throws IOException {
+ Path outputPath = FileOutputFormat.getOutputPath(conf);
+ if (outputPath != null) {
+ FileSystem fileSys = outputPath.getFileSystem(conf);
+ // create a file in the folder to mark it
+ if (fileSys.exists(outputPath)) {
+ Path filePath = new Path(outputPath, SUCCEEDED_FILE_NAME);
+ fileSys.create(filePath).close();
+ }
+ }
+ }
+
+ private Path getFinalPath(Path jobOutputDir, Path taskOutput,
+ Path taskOutputPath) throws IOException {
+ URI taskOutputUri = taskOutput.toUri();
+ URI relativePath = taskOutputPath.toUri().relativize(taskOutputUri);
+ if (taskOutputUri == relativePath) {//taskOutputPath is not a parent of taskOutput
+ throw new IOException("Can not get the relative path: base = " +
+ taskOutputPath + " child = " + taskOutput);
+ }
+ if (relativePath.getPath().length() > 0) {
+ return new Path(jobOutputDir, relativePath.getPath());
+ } else {
+ return jobOutputDir;
+ }
+ }
+ private void moveTaskOutputs(JobConf conf, TaskAttemptID taskAttemptID,
+ FileSystem fs,
+ Path jobOutputDir,
+ Path taskOutput)
+ throws IOException {
+ if (fs.isFile(taskOutput)) {
+ Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput,
+ getTempTaskOutputPath(conf, taskAttemptID));
+ if (!fs.rename(taskOutput, finalOutputPath)) {
+ if (!fs.delete(finalOutputPath, true)) {
+ throw new IOException("Failed to delete earlier output of task: " +
+ taskAttemptID);
+ }
+ if (!fs.rename(taskOutput, finalOutputPath)) {
+ throw new IOException("Failed to save output of task: " +
+ taskAttemptID);
+ }
+ }
+ LOG.debug("Moved " + taskOutput + " to " + finalOutputPath);
+ } else if(fs.getFileStatus(taskOutput).isDir()) {
+ FileStatus[] paths = fs.listStatus(taskOutput);
+ Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput,
+ getTempTaskOutputPath(conf, taskAttemptID));
+ fs.mkdirs(finalOutputPath);
+ if (paths != null) {
+ for (FileStatus path : paths) {
+ moveTaskOutputs(conf,taskAttemptID, fs, jobOutputDir, path.getPath());
+ }
+ }
+ }
+ }
+
+ public void commitTask(JobConf conf, TaskAttemptID taskAttemptID)
+ throws IOException {
+ Path taskOutputPath = getTempTaskOutputPath(conf, taskAttemptID);
+ if (taskOutputPath != null) {
+ FileSystem fs = taskOutputPath.getFileSystem(conf);
+ if (fs.exists(taskOutputPath)) {
+ Path jobOutputPath = taskOutputPath.getParent().getParent();
+ // Move the task outputs to their final place
+ moveTaskOutputs(conf,taskAttemptID, fs, jobOutputPath, taskOutputPath);
+ // Delete the temporary task-specific output directory
+ if (!fs.delete(taskOutputPath, true)) {
+ LOG.info("Failed to delete the temporary output" +
+ " directory of task: " + taskAttemptID + " - " + taskOutputPath);
+ }
+ LOG.info("Saved output of task '" + taskAttemptID + "' to " +
+ jobOutputPath);
+ }
+ }
+ }
+ public boolean needsTaskCommit(JobConf conf, TaskAttemptID taskAttemptID)
+ throws IOException {
+ try {
+ Path taskOutputPath = getTempTaskOutputPath(conf, taskAttemptID);
+ if (taskOutputPath != null) {
+ // Get the file-system for the task output directory
+ FileSystem fs = taskOutputPath.getFileSystem(conf);
+ // since task output path is created on demand,
+ // if it exists, task needs a commit
+ if (fs.exists(taskOutputPath)) {
+ return true;
+ }
+ }
+ } catch (IOException ioe) {
+ throw ioe;
+ }
+ return false;
+ }
+
+ public Path getTempTaskOutputPath(JobConf conf, TaskAttemptID taskAttemptID) {
+ Path outputPath = FileOutputFormat.getOutputPath(conf);
+ if (outputPath != null) {
+ Path p = new Path(outputPath,
+ (FileOutputCommitter.TEMP_DIR_NAME + Path.SEPARATOR +
+ "_" + taskAttemptID.toString()));
+ try {
+ FileSystem fs = p.getFileSystem(conf);
+ return p.makeQualified(fs);
+ } catch (IOException ie) {
+ LOG.warn(StringUtils.stringifyException(ie));
+ return p;
+ }
+ }
+ return null;
+ }
+ public void cleanupJob(JobConf conf) throws IOException {
+ // do the clean up of temporary directory
+ Path outputPath = FileOutputFormat.getOutputPath(conf);
+ if (outputPath != null) {
+ Path tmpDir = new Path(outputPath, FileOutputCommitter.TEMP_DIR_NAME);
+ FileSystem fileSys = tmpDir.getFileSystem(conf);
+ if (fileSys.exists(tmpDir)) {
+ fileSys.delete(tmpDir, true);
+ }
+ } else {
+ LOG.warn("Output path is null in cleanup");
+ }
+ }
+
+ public void commitJob(JobConf conf) throws IOException {
+ cleanupJob(conf);
+ if (getOutputDirMarking(conf)) {
+ markSuccessfulOutputDir(conf);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/HadoopTypeConverter.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/HadoopTypeConverter.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/HadoopTypeConverter.java
new file mode 100644
index 0000000..83c14e6
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/HadoopTypeConverter.java
@@ -0,0 +1,36 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred.record.datatypes;
+
+import java.io.Serializable;
+
+import eu.stratosphere.types.Record;
+
+
+/**
+ * Contract for classes that translate a Hadoop key/value pair into
+ * Stratosphere's Record model.
+ *
+ * Implementations must be Serializable.
+ *
+ * Stratosphere provides a DefaultHadoopTypeConverter. Custom implementations should
+ * chain the type converters.
+ *
+ * @param <K> type of the Hadoop key
+ * @param <V> type of the Hadoop value
+ */
+public interface HadoopTypeConverter<K, V> extends Serializable {
+
+    /**
+     * Converts a Hadoop key/value pair to Stratosphere types. The method
+     * returns nothing, so the converted data is expected to be stored in the
+     * supplied record.
+     *
+     * @param stratosphereRecord the record that receives the converted data
+     * @param hadoopKey the Hadoop key to convert
+     * @param hadoopValue the Hadoop value to convert
+     */
+    void convert(Record stratosphereRecord, K hadoopKey, V hadoopValue);
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/StratosphereTypeConverter.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/StratosphereTypeConverter.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/StratosphereTypeConverter.java
new file mode 100644
index 0000000..27d710d
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/StratosphereTypeConverter.java
@@ -0,0 +1,37 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred.record.datatypes;
+
+import java.io.Serializable;
+
+import eu.stratosphere.types.Record;
+
+/**
+ * Contract for classes that translate Stratosphere's Record into a
+ * Hadoop key and a Hadoop value.
+ *
+ * Implementations must be Serializable.
+ *
+ * Stratosphere provides a DefaultStratosphereTypeConverter. Custom implementations should
+ * chain the type converters.
+ *
+ * @param <K> type of the Hadoop key
+ * @param <V> type of the Hadoop value
+ */
+public interface StratosphereTypeConverter<K,V> extends Serializable {
+
+    /**
+     * Derives the Hadoop key from a Stratosphere record.
+     *
+     * @param stratosphereRecord the record to convert
+     * @return the Hadoop key for the record
+     */
+    K convertKey(Record stratosphereRecord);
+
+    /**
+     * Derives the Hadoop value from a Stratosphere record.
+     *
+     * @param stratosphereRecord the record to convert
+     * @return the Hadoop value for the record
+     */
+    V convertValue(Record stratosphereRecord);
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/WritableComparableWrapper.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/WritableComparableWrapper.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/WritableComparableWrapper.java
new file mode 100644
index 0000000..767f539
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/WritableComparableWrapper.java
@@ -0,0 +1,35 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred.record.datatypes;
+
+import org.apache.hadoop.io.WritableComparable;
+
+import eu.stratosphere.types.Key;
+
+/**
+ * A {@link WritableWrapper} for Hadoop {@link WritableComparable}s that can be
+ * used as a Stratosphere {@link Key}.
+ *
+ * Ordering, equality and hashing are all delegated to the wrapped object so
+ * that {@code equals} is consistent with {@code compareTo} and equal keys
+ * produce equal hash codes.
+ *
+ * @param <T> type of the wrapped Hadoop key
+ */
+public class WritableComparableWrapper<T extends WritableComparable<T>> extends WritableWrapper<T> implements Key<WritableComparableWrapper<T>> {
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Creates an empty wrapper; the wrapped object is set when the wrapper is
+     * read from a data input.
+     */
+    public WritableComparableWrapper() {
+        super();
+    }
+
+    /**
+     * Creates a wrapper around the given Hadoop key.
+     *
+     * @param toWrap the object to wrap
+     */
+    public WritableComparableWrapper(T toWrap) {
+        super(toWrap);
+    }
+
+    /**
+     * Compares the wrapped objects using the wrapped type's own ordering.
+     *
+     * @param o the wrapper to compare with
+     * @return the result of comparing this wrapper's object with the other one's
+     */
+    @Override
+    public int compareTo(WritableComparableWrapper<T> o) {
+        return value().compareTo(o.value());
+    }
+
+    /**
+     * Two wrappers are equal if their wrapped objects are equal (or both absent).
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (!(obj instanceof WritableComparableWrapper)) {
+            return false;
+        }
+        final Object mine = value();
+        final Object theirs = ((WritableComparableWrapper<?>) obj).value();
+        return mine == null ? theirs == null : mine.equals(theirs);
+    }
+
+    /**
+     * Returns the hash code of the wrapped object, or 0 if nothing is wrapped.
+     */
+    @Override
+    public int hashCode() {
+        final Object mine = value();
+        return mine == null ? 0 : mine.hashCode();
+    }
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/WritableWrapper.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/WritableWrapper.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/WritableWrapper.java
new file mode 100644
index 0000000..d74eb74
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/WritableWrapper.java
@@ -0,0 +1,66 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred.record.datatypes;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+
+import eu.stratosphere.types.Value;
+import eu.stratosphere.util.InstantiationUtil;
+
+/**
+ * Wraps a Hadoop {@link Writable} so that it can be used as a Stratosphere
+ * {@link Value}.
+ *
+ * The serialized form is the class name of the wrapped object followed by the
+ * bytes the wrapped object writes itself.
+ *
+ * @param <T> type of the wrapped Hadoop object
+ */
+public class WritableWrapper<T extends Writable> implements Value {
+    private static final long serialVersionUID = 2L;
+
+    private T wrapped;
+    private String wrappedType;
+    // Cached lazily in read(); transient because a ClassLoader is not serializable.
+    private transient ClassLoader cl;
+
+    /**
+     * Creates an empty wrapper; the wrapped object is created by {@link #read(DataInput)}.
+     */
+    public WritableWrapper() {
+    }
+
+    /**
+     * Creates a wrapper around the given object.
+     *
+     * @param toWrap the object to wrap; must not be null
+     */
+    public WritableWrapper(T toWrap) {
+        wrapped = toWrap;
+        // Use the binary name: it is what Class.forName() expects in read(). The canonical
+        // name differs for nested classes and is null for anonymous/local classes.
+        wrappedType = toWrap.getClass().getName();
+    }
+
+    /**
+     * @return the wrapped object, or null if nothing has been wrapped or read yet
+     */
+    public T value() {
+        return wrapped;
+    }
+
+    /**
+     * Writes the class name of the wrapped object followed by the object itself.
+     *
+     * @param out the output to write to
+     * @throws IOException if writing fails
+     */
+    @Override
+    public void write(DataOutput out) throws IOException {
+        out.writeUTF(wrappedType);
+        wrapped.write(out);
+    }
+
+    /**
+     * Reads a class name, instantiates that class and lets the new instance read its fields.
+     *
+     * @param in the input to read from
+     * @throws IOException if reading fails
+     * @throws RuntimeException if the class named in the input cannot be found
+     */
+    @Override
+    public void read(DataInput in) throws IOException {
+        if (cl == null) {
+            cl = Thread.currentThread().getContextClassLoader();
+            if (cl == null) {
+                // no context class loader set on this thread: use the loader of this class
+                cl = getClass().getClassLoader();
+            }
+        }
+        wrappedType = in.readUTF();
+        try {
+            @SuppressWarnings("unchecked")
+            Class<T> wrClass = (Class<T>) Class.forName(wrappedType, true, cl).asSubclass(Writable.class);
+            wrapped = InstantiationUtil.instantiate(wrClass, Writable.class);
+        } catch (ClassNotFoundException e) {
+            throw new RuntimeException("Error creating the WritableWrapper", e);
+        }
+        wrapped.readFields(in);
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/WritableWrapperConverter.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/WritableWrapperConverter.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/WritableWrapperConverter.java
new file mode 100644
index 0000000..2a42c51
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/datatypes/WritableWrapperConverter.java
@@ -0,0 +1,40 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred.record.datatypes;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+
+import eu.stratosphere.types.Record;
+import eu.stratosphere.types.Value;
+
+/**
+ * A {@link HadoopTypeConverter} that does not translate the Hadoop types but
+ * wraps them: the key is stored in record field 0 as a
+ * {@link WritableComparableWrapper}, the value in record field 1 as a
+ * {@link WritableWrapper}.
+ *
+ * @param <K> type of the Hadoop key
+ * @param <V> type of the Hadoop value
+ */
+@SuppressWarnings("rawtypes")
+public class WritableWrapperConverter<K extends WritableComparable, V extends Writable> implements HadoopTypeConverter<K,V> {
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Wraps the key and the value and stores them in fields 0 and 1 of the record.
+     *
+     * @param stratosphereRecord the record that receives the wrapped key and value
+     * @param hadoopKey the Hadoop key to wrap
+     * @param hadoopValue the Hadoop value to wrap
+     */
+    @Override
+    @SuppressWarnings("unchecked")
+    public void convert(Record stratosphereRecord, K hadoopKey, V hadoopValue) {
+        final Value keyWrapper = new WritableComparableWrapper(hadoopKey);
+        stratosphereRecord.setField(0, keyWrapper);
+
+        final Value valueWrapper = new WritableWrapper<V>(hadoopValue);
+        stratosphereRecord.setField(1, valueWrapper);
+    }
+}
http://git-wip-us.apache.org/repos/asf/incubator-flink/blob/a65b7591/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/example/WordCount.java
----------------------------------------------------------------------
diff --git a/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/example/WordCount.java b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/example/WordCount.java
new file mode 100644
index 0000000..e135ed5
--- /dev/null
+++ b/stratosphere-addons/hadoop-compatibility/src/main/java/eu/stratosphere/hadoopcompatibility/mapred/record/example/WordCount.java
@@ -0,0 +1,179 @@
+/***********************************************************************************************************************
+ * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ **********************************************************************************************************************/
+
+package eu.stratosphere.hadoopcompatibility.mapred.record.example;
+
+import java.io.Serializable;
+import java.util.Iterator;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.TextInputFormat;
+
+import eu.stratosphere.api.common.Plan;
+import eu.stratosphere.api.common.Program;
+import eu.stratosphere.api.common.ProgramDescription;
+import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFields;
+import eu.stratosphere.api.java.record.functions.MapFunction;
+import eu.stratosphere.api.java.record.functions.ReduceFunction;
+import eu.stratosphere.api.java.record.io.CsvOutputFormat;
+import eu.stratosphere.api.java.record.operators.FileDataSink;
+import eu.stratosphere.api.java.record.operators.MapOperator;
+import eu.stratosphere.api.java.record.operators.ReduceOperator;
+import eu.stratosphere.api.java.record.operators.ReduceOperator.Combinable;
+import eu.stratosphere.client.LocalExecutor;
+import eu.stratosphere.hadoopcompatibility.mapred.record.HadoopDataSource;
+import eu.stratosphere.hadoopcompatibility.mapred.record.datatypes.WritableWrapperConverter;
+import eu.stratosphere.types.IntValue;
+import eu.stratosphere.types.Record;
+import eu.stratosphere.types.StringValue;
+import eu.stratosphere.util.Collector;
+
+/**
+ * Implements a word count which takes the input file and counts the number of
+ * the occurrences of each word in the file.
+ *
+ * <br /><br />
+ *
+ * <b>Note</b>: This example uses the outdated Record API.
+ * It is recommended to use the new Java API.
+ *
+ * @see eu.stratosphere.hadoopcompatibility.mapred.record.example.example.WordCount
+ */
+public class WordCount implements Program, ProgramDescription {
+
+    private static final long serialVersionUID = 1L;
+
+
+    /**
+     * Converts a Record containing a line of text into multiple string/integer pairs.
+     * The line is read from field <code>1</code> of the record, normalized and tokenized
+     * by whitespaces. For each token a new record is emitted,
+     * where the token is the first field and an Integer(1) is the second field.
+     */
+    public static class TokenizeLine extends MapFunction implements Serializable {
+        private static final long serialVersionUID = 1L;
+
+        @Override
+        public void map(Record record, Collector<Record> collector) {
+            // read the line from field 1 of the record (as type StringValue)
+            String line = record.getField(1, StringValue.class).getValue();
+            // normalize the line: collapse runs of non-word characters to a blank, lower-case the rest
+            line = line.replaceAll("\\W+", " ").toLowerCase();
+
+            // tokenize the line
+            StringTokenizer tokenizer = new StringTokenizer(line);
+            while (tokenizer.hasMoreTokens()) {
+                String word = tokenizer.nextToken();
+
+                // we emit a (word, 1) pair
+                collector.collect(new Record(new StringValue(word), new IntValue(1)));
+            }
+        }
+    }
+
+    /**
+     * Sums up the counts for a certain given key. The counts are assumed to be at position <code>1</code>
+     * in the record. The other fields are not modified.
+     */
+    @Combinable
+    @ConstantFields(0)
+    public static class CountWords extends ReduceFunction implements Serializable {
+
+        private static final long serialVersionUID = 1L;
+
+        @Override
+        public void reduce(Iterator<Record> records, Collector<Record> out) throws Exception {
+            // nothing to aggregate; also avoids dereferencing a null record below
+            if (!records.hasNext()) {
+                return;
+            }
+
+            Record element = null;
+            int sum = 0;
+            while (records.hasNext()) {
+                element = records.next();
+                int cnt = element.getField(1, IntValue.class).getValue();
+                sum += cnt;
+            }
+
+            // the last record seen is reused to emit the result
+            element.setField(1, new IntValue(sum));
+            out.collect(element);
+        }
+
+        @Override
+        public void combine(Iterator<Record> records, Collector<Record> out) throws Exception {
+            // the logic is the same as in the reduce function, so simply call the reduce method
+            reduce(records, out);
+        }
+    }
+
+
+    /**
+     * Assembles the word count plan: Hadoop text input, tokenizing mapper,
+     * counting reducer and a CSV file sink.
+     *
+     * @param args optional parameters: the number of sub tasks (default 1), the input
+     *             path (default empty) and the output path (default empty)
+     * @return the plan of the word count job
+     */
+    @SuppressWarnings({ "rawtypes", "unchecked", "unused" })
+    @Override
+    public Plan getPlan(String... args) {
+        // parse job parameters
+        int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
+        String dataInput = (args.length > 1 ? args[1] : "");
+        String output = (args.length > 2 ? args[2] : "");
+
+
+        HadoopDataSource source = new HadoopDataSource(new TextInputFormat(), new JobConf(), "Input Lines");
+        TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));
+
+        // Example with Wrapper Converter (shown for illustration, not used in the plan below)
+        HadoopDataSource<LongWritable,Text> sourceHadoopType = new HadoopDataSource<LongWritable, Text>(
+                new TextInputFormat(), new JobConf(), "Input Lines", new WritableWrapperConverter<LongWritable, Text>());
+        // register the path with this source's own configuration; adding it to 'source'
+        // again would list the input path twice for the source used in the plan
+        TextInputFormat.addInputPath(sourceHadoopType.getJobConf(), new Path(dataInput));
+
+        MapOperator mapper = MapOperator.builder(new TokenizeLine())
+            .input(source)
+            .name("Tokenize Lines")
+            .build();
+        ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
+            .input(mapper)
+            .name("Count Words")
+            .build();
+        FileDataSink out = new FileDataSink(new CsvOutputFormat(), output, reducer, "Word Counts");
+        CsvOutputFormat.configureRecordFormat(out)
+            .recordDelimiter('\n')
+            .fieldDelimiter(' ')
+            .field(StringValue.class, 0)
+            .field(IntValue.class, 1);
+
+        Plan plan = new Plan(out, "WordCount Example");
+        plan.setDefaultParallelism(numSubTasks);
+        return plan;
+    }
+
+
+    @Override
+    public String getDescription() {
+        return "Parameters: [numSubStasks] [input] [output]";
+    }
+
+
+    /**
+     * Runs the word count in a local executor. Prints the parameter description
+     * and exits with status 1 if fewer than three arguments are given.
+     *
+     * @param args the number of sub tasks, the input path and the output path
+     * @throws Exception if the execution of the plan fails
+     */
+    public static void main(String[] args) throws Exception {
+        WordCount wc = new WordCount();
+
+        if (args.length < 3) {
+            System.err.println(wc.getDescription());
+            System.exit(1);
+        }
+
+        Plan plan = wc.getPlan(args);
+
+        // This will execute the word-count embedded in a local context. replace this line by the commented
+        // succeeding line to send the job to a local installation or to a cluster for execution
+        LocalExecutor.execute(plan);
+//        PlanExecutor ex = new RemoteExecutor("localhost", 6123, "target/pact-examples-0.4-SNAPSHOT-WordCount.jar");
+//        ex.executePlan(plan);
+    }
+}