Posted to dev@systemds.apache.org by GitBox <gi...@apache.org> on 2021/06/08 08:26:30 UTC

[GitHub] [systemds] Baunsgaard commented on a change in pull request #1299: [SYSTEMDS-206] HDF5 File Format

Baunsgaard commented on a change in pull request #1299:
URL: https://github.com/apache/systemds/pull/1299#discussion_r647187341



##########
File path: src/main/java/org/apache/sysds/parser/DMLTranslator.java
##########
@@ -1058,6 +1058,10 @@ public void constructHops(StatementBlock sb) {
 					case FEDERATED:
 						ae.setOutputParams(ae.getDim1(), ae.getDim2(), -1, ae.getUpdateType(), -1);
 						break;
+					case HDF5:
+						// write output in HDF5 format
+						ae.setOutputParams(ae.getDim1(), ae.getDim2(), ae.getNnz(), ae.getUpdateType(), ConfigurationManager.getBlocksize());

Review comment:
       could this not use the same method call as the Binary case?
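   if the BINARY case sets the same output parameters (an assumption here, untested sketch), HDF5 could simply fall through to it:
   
       case BINARY:
       case HDF5: // write output in HDF5 format, same output params as binary
           ae.setOutputParams(ae.getDim1(), ae.getDim2(), ae.getNnz(), ae.getUpdateType(), ConfigurationManager.getBlocksize());
           break;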

##########
File path: src/main/java/org/apache/sysds/parser/DataExpression.java
##########
@@ -112,6 +112,9 @@
 	public static final String DELIM_NA_STRING_SEP = "\u00b7";
 	// Parameter names relevant to reading/writing delimited index/libsvmv files
 	public static final String LIBSVM_INDEX_DELIM = "indSep";
+
+	// Parameter names relevant to reading/writing dataset name/hdf5 files
+	public static final String HDF5_DATASET_NAME = "dataset";

Review comment:
       Does this mean that only the data inside the HDF5 file called "dataset" is read? Or how does this contribute to the reading?

##########
File path: src/main/java/org/apache/sysds/parser/DataExpression.java
##########
@@ -1205,7 +1212,9 @@ else if( getVarParam(READNNZPARAM) != null ) {
 					}
 				}
 			}
-			
+			boolean isHDF5;
+			isHDF5 = (formatTypeString != null && formatTypeString.equalsIgnoreCase(FileFormat.HDF5.toString()));

Review comment:
       this could be done on one line, e.g.:
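   (sketch):
   
       boolean isHDF5 = (formatTypeString != null
           && formatTypeString.equalsIgnoreCase(FileFormat.HDF5.toString()));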

##########
File path: src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java
##########
@@ -48,14 +48,7 @@
 import org.apache.sysds.runtime.data.TensorBlock;
 import org.apache.sysds.runtime.instructions.Instruction;
 import org.apache.sysds.runtime.instructions.InstructionUtils;
-import org.apache.sysds.runtime.io.FileFormatProperties;
-import org.apache.sysds.runtime.io.FileFormatPropertiesCSV;
-import org.apache.sysds.runtime.io.FileFormatPropertiesLIBSVM;
-import org.apache.sysds.runtime.io.IOUtilFunctions;
-import org.apache.sysds.runtime.io.ListReader;
-import org.apache.sysds.runtime.io.ListWriter;
-import org.apache.sysds.runtime.io.WriterMatrixMarket;
-import org.apache.sysds.runtime.io.WriterTextCSV;
+import org.apache.sysds.runtime.io.*;

Review comment:
       no wildcard imports; you should configure your IDE to not allow this.
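   e.g. the explicit form, keeping the previous imports and adding only the new HDF5 classes this file actually needs (sketch; the exact list should be taken from what the compiler requires):
   
       import org.apache.sysds.runtime.io.FileFormatProperties;
       import org.apache.sysds.runtime.io.FileFormatPropertiesCSV;
       import org.apache.sysds.runtime.io.FileFormatPropertiesHDF5;
       import org.apache.sysds.runtime.io.FileFormatPropertiesLIBSVM;
       import org.apache.sysds.runtime.io.IOUtilFunctions;
       import org.apache.sysds.runtime.io.ListReader;
       import org.apache.sysds.runtime.io.ListWriter;
       import org.apache.sysds.runtime.io.WriterMatrixMarket;
       import org.apache.sysds.runtime.io.WriterTextCSV;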

##########
File path: src/main/java/org/apache/sysds/runtime/io/InputOutputInfo.java
##########
@@ -82,6 +84,7 @@ public static InputOutputInfo get(DataType dt, FileFormat fmt) {
 			case MM:     return MatrixMarketInputOutputInfo;
 			case CSV:    return CSVInputOutputInfo;
 			case LIBSVM: return LIBSVMInputOutputInfo;
+			case HDF5: return HDF5InputOutputInfo;

Review comment:
       indentation

##########
File path: src/main/java/org/apache/sysds/runtime/io/MatrixWriterFactory.java
##########
@@ -77,6 +77,15 @@ public static MatrixWriter createMatrixWriter(FileFormat fmt, int replication, F
 					writer = new WriterBinaryBlock(replication);
 				break;
 
+			case HDF5:
+				if(props != null && !(props instanceof FileFormatPropertiesHDF5))
+					throw new DMLRuntimeException("Wrong type of file format properties for HDF5 writer.");
+				if( ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_WRITE_TEXTFORMATS) )
+					writer = new WriterHDF5Parallel((FileFormatPropertiesHDF5) props);
+				else
+					writer = new WriterHDF5((FileFormatPropertiesHDF5) props);

Review comment:
       i know you copied from the other implementations,
   but i would change the syntax to:
   if ...
   else if ...
   else ...
   
   and i would return the writer directly rather than assigning it to the writer variable;
   this removes the break statement as well, e.g.:
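   (sketch):
   
       case HDF5:
           if(props != null && !(props instanceof FileFormatPropertiesHDF5))
               throw new DMLRuntimeException("Wrong type of file format properties for HDF5 writer.");
           else if(ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_WRITE_TEXTFORMATS))
               return new WriterHDF5Parallel((FileFormatPropertiesHDF5) props);
           else
               return new WriterHDF5((FileFormatPropertiesHDF5) props);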

##########
File path: src/main/java/org/apache/sysds/runtime/io/ReaderHDF5.java
##########
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io;
+
+import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.sysds.conf.ConfigurationManager;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.io.hdf5.H5;
+import org.apache.sysds.runtime.io.hdf5.H5ContiguousDataset;
+import org.apache.sysds.runtime.io.hdf5.H5RootObject;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+public class ReaderHDF5 extends MatrixReader {
+	protected final FileFormatPropertiesHDF5 _props;
+
+	public ReaderHDF5(FileFormatPropertiesHDF5 props) {
+		_props = props;
+	}
+
+	@Override public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int blen, long estnnz)
+		throws IOException, DMLRuntimeException {
+		//allocate output matrix block
+		MatrixBlock ret = null;
+		if(rlen >= 0 && clen >= 0) //otherwise allocated on read
+			ret = createOutputMatrixBlock(rlen, clen, (int) rlen, estnnz, true, false);
+
+		//prepare file access
+		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
+		Path path = new Path(fname);
+		FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
+
+		//check existence and non-empty file
+		checkValidInputFile(fs, path);
+		//core read 
+		ret = readHDF5MatrixFromHDFS(path, job, fs, ret, rlen, clen, blen, _props.getDatasetName());
+
+		//finally check if change of sparse/dense block representation required
+		//(nnz explicitly maintained during read)
+		ret.examSparsity();
+
+		return ret;
+	}
+
+	@Override public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long clen, int blen, long estnnz)
+		throws IOException, DMLRuntimeException {
+		//allocate output matrix block
+		MatrixBlock ret = createOutputMatrixBlock(rlen, clen, (int) rlen, estnnz, true, false);
+
+		//core read
+		String datasetName = _props.getDatasetName();
+
+		BufferedInputStream bis = new BufferedInputStream(is);
+
+		long lnnz = readMatrixFromHDF5(bis, datasetName, ret, new MutableInt(0), rlen, clen, blen);
+
+		//finally check if change of sparse/dense block representation required
+		ret.setNonZeros(lnnz);
+		ret.examSparsity();
+
+		return ret;
+	}
+
+	@SuppressWarnings("unchecked") private static MatrixBlock readHDF5MatrixFromHDFS(Path path, JobConf job,
+		FileSystem fs, MatrixBlock dest, long rlen, long clen, int blen, String datasetName)
+		throws IOException, DMLRuntimeException {
+		//prepare file paths in alphanumeric order
+		ArrayList<Path> files = new ArrayList<>();
+		if(fs.isDirectory(path)) {
+			for(FileStatus stat : fs.listStatus(path, IOUtilFunctions.hiddenFileFilter))
+				files.add(stat.getPath());
+			Collections.sort(files);
+		}
+		else
+			files.add(path);
+
+		//determine matrix size via additional pass if required
+		if(dest == null) {
+			dest = computeHDF5Size(files, fs, datasetName);
+			clen = dest.getNumColumns();
+			rlen = dest.getNumRows();
+		}
+
+		//actual read of individual files
+		long lnnz = 0;
+		MutableInt row = new MutableInt(0);
+		for(int fileNo = 0; fileNo < files.size(); fileNo++) {
+			BufferedInputStream bis = new BufferedInputStream(fs.open(files.get(fileNo)));
+			lnnz += readMatrixFromHDF5(bis, datasetName, dest, row, rlen, clen, blen);
+		}
+		//post processing
+		dest.setNonZeros(lnnz);
+
+		return dest;
+	}
+
+	public static long readMatrixFromHDF5(BufferedInputStream bis, String datasetName, MatrixBlock dest,

Review comment:
       if these methods are only used in this class, make them private
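   i.e., if there are no callers outside ReaderHDF5 (to be checked), e.g.:
   
       private static long readMatrixFromHDF5(BufferedInputStream bis, String datasetName,
           MatrixBlock dest, MutableInt rowPos, long rlen, long clen, int blen)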

##########
File path: src/main/java/org/apache/sysds/runtime/io/WriterHDF5Parallel.java
##########
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.sysds.common.Types;
+import org.apache.sysds.conf.DMLConfig;
+import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.util.CommonThreadPool;
+import org.apache.sysds.runtime.util.HDFSTool;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+
+public class WriterHDF5Parallel extends WriterHDF5 {
+
+	public WriterHDF5Parallel(FileFormatPropertiesHDF5 _props) {
+		super(_props);
+	}
+
+	@Override public void writeHDF5MatrixToHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock src)

Review comment:
       \@Override should be on its own line above the method signature, e.g.:
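   (sketch):
   
       @Override
       public void writeHDF5MatrixToHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock src)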

##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/H5BufferBuilder.java
##########
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.runtime.io.hdf5;
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+import java.util.BitSet;
+
+import static java.nio.ByteOrder.LITTLE_ENDIAN;
+
+public class H5BufferBuilder {
+
+	private final ByteArrayOutputStream byteArrayOutputStream;
+	private final DataOutputStream dataOutputStream;
+	private ByteOrder byteOrder = LITTLE_ENDIAN;
+
+	public H5BufferBuilder() {
+		this.byteArrayOutputStream = new ByteArrayOutputStream();
+		this.dataOutputStream = new DataOutputStream(byteArrayOutputStream);
+	}
+
+	public int getSize() {
+		return dataOutputStream.size();
+	}
+
+	public void writeByte(int i) {
+		try {
+			dataOutputStream.writeByte(i);
+		}
+		catch(IOException e) {
+			throw new H5Exception(e);

Review comment:
       these H5Exceptions are Runtime Exceptions, maybe change the name to H5RuntimeException?
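   e.g. (sketch of the rename; the constructors mirror the current usage):
   
       public class H5RuntimeException extends RuntimeException {
           public H5RuntimeException(String message) { super(message); }
           public H5RuntimeException(Throwable cause) { super(cause); }
       }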

##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/H5Constants.java
##########
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io.hdf5;
+
+public final class H5Constants {
+
+	private H5Constants() {
+		throw new AssertionError("No instances of Constants");

Review comment:
       since you made the class final there is no way to extend it with new constructors, so i don't think you need to throw here, e.g.:
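   an empty private constructor is sufficient (sketch):
   
       public final class H5Constants {
           private H5Constants() {
               // private constructor prevents instantiation, final prevents subclassing
           }
       }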

##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/H5RootObject.java
##########
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io.hdf5;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import static java.nio.ByteOrder.LITTLE_ENDIAN;
+
+public class H5RootObject {
+
+	protected BufferedInputStream bufferedInputStream;
+	protected BufferedOutputStream bufferedOutputStream;
+	//------------------
+	//protected FileChannel fileChannel;

Review comment:
       Remove the commented-out lines.

##########
File path: src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java
##########
@@ -975,8 +988,10 @@ else if( getInput1().getDataType() == DataType.MATRIX ) {
 				writeMMFile(ec, fname);
 			else if( fmt == FileFormat.CSV )
 				writeCSVFile(ec, fname);
-			else if(fmt == FileFormat.LIBSVM)
-				writeLIBSVMFile(ec, fname);
+       		else if(fmt == FileFormat.LIBSVM)
+        		writeLIBSVMFile(ec, fname);

Review comment:
       Indentation

##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/message/H5ObjectModificationTimeMessage.java
##########
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.runtime.io.hdf5.message;
+
+import org.apache.sysds.runtime.io.hdf5.*;

Review comment:
       No wildcard imports.

##########
File path: src/test/java/org/apache/sysds/test/functions/io/hdf5/WriteHDF5Test.java
##########
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.io.hdf5;
+
+import org.apache.sysds.api.DMLScript;
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.conf.CompilerConfig;
+import org.apache.sysds.test.TestConfiguration;
+import org.junit.Test;
+
+public abstract class WriteHDF5Test extends WriteHDF5TestBase {
+
+	protected abstract int getId();
+
+	protected String getOutputHDF5FileName() {
+		return "transfusion_"+getId()+".h5";
+	}
+
+	private final static double eps = 1e-9;
+
+	@Test public void testHDF51_Seq_CP() {
+		runWriteHDF5Test(getId(), ExecMode.SINGLE_NODE, false, 100, 100);
+	}
+
+	@Test public void testHDF52_Seq_CP() {
+		runWriteHDF5Test(getId(), ExecMode.SINGLE_NODE, false, 1000, 1000);
+	}
+
+	@Test public void testHDF53_Seq_CP() {
+		runWriteHDF5Test(getId(), ExecMode.SINGLE_NODE, false, 5000, 5000);
+	}
+
+	@Test public void testHDF51_Parallel_CP() {
+		runWriteHDF5Test(getId(), ExecMode.SINGLE_NODE, true, 1000, 1000);
+	}
+
+	@Test public void testHDF52_Parallel_CP() {
+		runWriteHDF5Test(getId(), ExecMode.SINGLE_NODE, true, 5000, 5000);
+	}
+
+	protected void runWriteHDF5Test(int testNumber, ExecMode platform, boolean parallel, int rows, int cols) {
+
+		ExecMode oldPlatform = rtplatform;
+		rtplatform = platform;
+
+		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if(rtplatform == ExecMode.SPARK)
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+		boolean oldpar = CompilerConfig.FLAG_PARREADWRITE_TEXT;
+
+		try {
+
+			CompilerConfig.FLAG_PARREADWRITE_TEXT = parallel;
+
+			TestConfiguration config = getTestConfiguration(getTestName());
+			loadTestConfiguration(config);
+
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			String outMatrixName = HOME + OUTPUT_DIR + getOutputHDF5FileName();
+			String dmlOutput = output("dml.scalar");
+
+			String datasetName = "DATASET_1";
+
+			//TODO: add RScript for verify file format
+
+			fullDMLScriptName = HOME + getTestName() + "_" + testNumber + ".dml";
+			programArgs = new String[] {"-args", input("A"), outMatrixName, datasetName};
+
+			//generate actual datasets
+			double[][] A = getRandomMatrix(rows, cols, 0, 10, 1, 714);
+			writeInputMatrixWithMTD("A", A, false);
+
+			runTest(true, false, null, -1);

Review comment:
       Missing verification of the written matrix, e.g.:
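   read the written file back and compare against A (sketch; assumes a FileFormatPropertiesHDF5(String) constructor and reuses the PR's ReaderHDF5):
   
       MatrixBlock written = new ReaderHDF5(new FileFormatPropertiesHDF5(datasetName))
           .readMatrixFromHDFS(outMatrixName, rows, cols, -1, -1);
       TestUtils.compareMatrices(A, DataConverter.convertToDoubleMatrix(written), rows, cols, eps);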

##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/message/H5FillValueMessage.java
##########
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.runtime.io.hdf5.message;
+
+import org.apache.sysds.runtime.io.hdf5.H5BufferBuilder;
+import org.apache.sysds.runtime.io.hdf5.H5Constants;
+import org.apache.sysds.runtime.io.hdf5.H5RootObject;
+import org.apache.sysds.runtime.io.hdf5.Utils;
+
+import java.nio.ByteBuffer;
+import java.util.BitSet;
+
+public class H5FillValueMessage extends H5Message {
+
+	private final int spaceAllocationTime;
+	private final int fillValueWriteTime;
+	private final boolean fillValueDefined;
+	private final ByteBuffer fillValue;
+
+	public H5FillValueMessage(H5RootObject rootObject, BitSet flags, int spaceAllocationTime, int fillValueWriteTime,
+		boolean fillValueDefined) {
+
+		super(rootObject, flags);
+		this.spaceAllocationTime = spaceAllocationTime;
+		this.fillValueWriteTime = fillValueWriteTime;
+		this.fillValueDefined = fillValueDefined;
+		this.fillValue = null;
+	}
+
+	public H5FillValueMessage(H5RootObject rootObject, BitSet flags, ByteBuffer bb) {
+		super(rootObject, flags);
+
+		// version
+		rootObject.setFillValueVersion(bb.get());
+
+		spaceAllocationTime = bb.get();
+		fillValueWriteTime = bb.get();
+		boolean fillValueMaybeDefined = bb.get() == 1;
+
+		if(fillValueMaybeDefined) {
+			int size = Utils.readBytesAsUnsignedInt(bb, 4);
+			if(size > 0) {
+				fillValue = Utils.createSubBuffer(bb, size);
+				fillValueDefined = true;
+			}
+			else {
+				fillValue = null;
+				fillValueDefined = false;
+			}
+		}
+		else {
+			fillValue = null;
+			fillValueDefined = false;
+		}
+	}
+
+	@Override public void toBuffer(H5BufferBuilder bb) {

Review comment:
       \@Override should be on its own line, as noted above.

##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/H5.java
##########
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io.hdf5;
+
+import org.apache.sysds.runtime.io.hdf5.message.H5SymbolTableMessage;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+public class H5 {
+
+	// H5 format write/read steps:
+	// 1. Create/Open a File (H5Fcreate)
+	// 2. Create/open a Dataspace
+	// 3. Create/Open a Dataset
+	// 4. Write/Read
+	// 5. Close File
+
+	public static H5RootObject H5Fopen(BufferedInputStream bis) {
+		bis.mark(0);
+
+		H5RootObject rootObject = new H5RootObject();
+		try {
+			// Find out if the file is a HDF5 file
+			int maxSignatureLength = 2048;
+			boolean validSignature = false;
+			long offset;
+			for(offset = 0; offset < maxSignatureLength; offset = nextOffset(offset)) {
+				validSignature = H5Superblock.verifySignature(bis, offset);
+				if(validSignature) {
+					break;
+				}
+			}
+			if(!validSignature) {
+				throw new H5Exception("No valid HDF5 signature found");
+			}
+			rootObject.setBufferedInputStream(bis);
+
+			final H5Superblock superblock = new H5Superblock(bis, offset);
+			rootObject.setSuperblock(superblock);
+		}
+		catch(Exception exception) {
+			//exception.printStackTrace();
+			throw new H5Exception("Can't open fine " + exception);
+		}
+		return rootObject;
+	}
+
+	private static long nextOffset(long offset) {
+		if(offset == 0) {
+			return 512L;
+		}
+		return offset * 2;
+	}
+
+	// Create Data Space
+	public static H5RootObject H5Screate(BufferedOutputStream bos, long row, long col) {
+
+		try {
+			H5RootObject rootObject = new H5RootObject();
+			rootObject.setBufferedOutputStream(bos);
+			rootObject.bufferBuilder = new H5BufferBuilder();
+			final H5Superblock superblock = new H5Superblock();
+			superblock.versionOfSuperblock = 0;
+			superblock.versionNumberOfTheFileFreeSpaceInformation = 0;
+			superblock.versionOfRootGroupSymbolTableEntry = 0;
+			superblock.versionOfSharedHeaderMessageFormat = 0;
+			superblock.sizeOfOffsets = 8;
+			superblock.sizeOfLengths = 8;
+			superblock.groupLeafNodeK = 4;
+			superblock.groupInternalNodeK = 16;
+			superblock.baseAddressByte = 0;
+			superblock.addressOfGlobalFreeSpaceIndex = -1;
+			superblock.endOfFileAddress = 2048 + (row * col * 8); // double value
+			superblock.driverInformationBlockAddress = -1;
+			superblock.rootGroupSymbolTableAddress = 56;
+
+			rootObject.setSuperblock(superblock);
+			rootObject.setRank(2);
+			rootObject.setCol(col);
+			rootObject.setRow(row);
+
+			superblock.toBuffer(rootObject.bufferBuilder);
+
+			H5SymbolTableEntry symbolTableEntry = new H5SymbolTableEntry(rootObject);
+			symbolTableEntry.toBuffer(rootObject.bufferBuilder);
+
+			return rootObject;
+
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	// Open a Data Space
+	public static H5ContiguousDataset H5Dopen(H5RootObject rootObject, String datasetName) {
+		try {
+			H5SymbolTableEntry symbolTableEntry = new H5SymbolTableEntry(rootObject,
+				rootObject.getSuperblock().rootGroupSymbolTableAddress - rootObject.getSuperblock().baseAddressByte);
+
+			H5ObjectHeader objectHeader = new H5ObjectHeader(rootObject, symbolTableEntry.getObjectHeaderAddress());
+
+			final H5SymbolTableMessage stm = (H5SymbolTableMessage) objectHeader.getMessages().get(0);
+			final H5BTree rootBTreeNode = new H5BTree(rootObject, stm.getbTreeAddress());
+			final H5LocalHeap rootNameHeap = new H5LocalHeap(rootObject, stm.getLocalHeapAddress());
+			final ByteBuffer nameBuffer = rootNameHeap.getDataBuffer();
+			final List<Long> childAddresses = rootBTreeNode.getChildAddresses();
+
+			long child = childAddresses.get(0);
+
+			H5GroupSymbolTableNode groupSTE = new H5GroupSymbolTableNode(rootObject, child);
+
+			symbolTableEntry = groupSTE.getSymbolTableEntries()[0];
+
+			nameBuffer.position(symbolTableEntry.getLinkNameOffset());
+			String childName = Utils.readUntilNull(nameBuffer);
+
+			if(!childName.equals(datasetName)) {
+				throw new H5Exception("The dataset name '" + datasetName + "' not found!");
+			}
+
+			final H5ObjectHeader header = new H5ObjectHeader(rootObject, symbolTableEntry.getObjectHeaderAddress());
+			final H5ContiguousDataset contiguousDataset = new H5ContiguousDataset(rootObject, header);
+			return contiguousDataset;
+
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	// Create Dataset
+	public static void H5Dcreate(H5RootObject rootObject, long maxRow, long maxCol, String datasetName) {
+
+		if(rootObject.getRank() == 2) {
+
+			rootObject.setMaxRow(maxRow);
+			rootObject.setMaxCol(maxCol);
+			rootObject.setDatasetName(datasetName);
+			H5ObjectHeader objectHeader = new H5ObjectHeader(rootObject, datasetName);
+			objectHeader.toBuffer(rootObject.bufferBuilder);
+			rootObject.bufferBuilder.goToPositionWithWriteZero(2048);
+
+		}
+		else
+			throw new H5Exception("Just support Matrix!");
+	}
+
+	public static void H5WriteHeaders(H5RootObject rootObject) {
+		try {
+			rootObject.getBufferedOutputStream().write(rootObject.bufferBuilder.build().array());
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	// Write Data
+	public static void H5Dwrite(H5RootObject rootObject, double[] data) {
+		try {
+			H5BufferBuilder bb = new H5BufferBuilder();
+			for(Double d : data) {
+				bb.writeDouble(d);
+			}
+			rootObject.getBufferedOutputStream().write(bb.noOrderBuild().array());
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	public static void H5Dwrite(H5RootObject rootObject, double[][] data) {
+
+		long dataSize = rootObject.getRow() * rootObject.getCol() * 8; // 8 value is size of double
+		int maxBufferSize = (int) (Integer.MAX_VALUE * 0.2); // max buffer size is ratio of data transfer
+		int bufferSize = (int) Math.min(maxBufferSize, dataSize);
+
+		H5BufferBuilder bb = new H5BufferBuilder();
+		try {
+			for(int i = 0; i < rootObject.getRow(); i++) {
+				for(int j = 0; j < rootObject.getCol(); j++) {
+
+					// if the buffer is full then flush buffer into file and reseat the buffer
+					if(bb.getSize() + 8 > bufferSize) {
+						rootObject.getBufferedOutputStream().write(bb.build().array());
+						bb = new H5BufferBuilder();
+					}
+					bb.writeDouble(data[i][j]);
+				}
+			}
+			// write last data to buffer
+			rootObject.getBufferedOutputStream().write(bb.build().array());
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	public static double[][] H5Dread(H5RootObject rootObject, H5ContiguousDataset dataset) {

Review comment:
       Same here, it would be nicer if you did not read into an intermediate nested double[][] representation.

##########
File path: src/main/java/org/apache/sysds/runtime/io/ReaderHDF5.java
##########
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io;
+
+import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.sysds.conf.ConfigurationManager;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.io.hdf5.H5;
+import org.apache.sysds.runtime.io.hdf5.H5ContiguousDataset;
+import org.apache.sysds.runtime.io.hdf5.H5RootObject;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+public class ReaderHDF5 extends MatrixReader {
+	protected final FileFormatPropertiesHDF5 _props;
+
+	public ReaderHDF5(FileFormatPropertiesHDF5 props) {
+		_props = props;
+	}
+
+	@Override public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int blen, long estnnz)
+		throws IOException, DMLRuntimeException {
+		//allocate output matrix block
+		MatrixBlock ret = null;
+		if(rlen >= 0 && clen >= 0) //otherwise allocated on read
+			ret = createOutputMatrixBlock(rlen, clen, (int) rlen, estnnz, true, false);
+
+		//prepare file access
+		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
+		Path path = new Path(fname);
+		FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
+
+		//check existence and non-empty file
+		checkValidInputFile(fs, path);
+		//core read 
+		ret = readHDF5MatrixFromHDFS(path, job, fs, ret, rlen, clen, blen, _props.getDatasetName());
+
+		//finally check if change of sparse/dense block representation required
+		//(nnz explicitly maintained during read)
+		ret.examSparsity();
+
+		return ret;
+	}
+
+	@Override public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long clen, int blen, long estnnz)
+		throws IOException, DMLRuntimeException {
+		//allocate output matrix block
+		MatrixBlock ret = createOutputMatrixBlock(rlen, clen, (int) rlen, estnnz, true, false);
+
+		//core read
+		String datasetName = _props.getDatasetName();
+
+		BufferedInputStream bis = new BufferedInputStream(is);
+
+		long lnnz = readMatrixFromHDF5(bis, datasetName, ret, new MutableInt(0), rlen, clen, blen);
+
+		//finally check if change of sparse/dense block representation required
+		ret.setNonZeros(lnnz);
+		ret.examSparsity();
+
+		return ret;
+	}
+
+	@SuppressWarnings("unchecked") private static MatrixBlock readHDF5MatrixFromHDFS(Path path, JobConf job,

Review comment:
       move the \@SuppressWarnings annotation to the line above.

##########
File path: src/main/java/org/apache/sysds/runtime/io/ReaderHDF5.java
##########
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io;
+
+import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.sysds.conf.ConfigurationManager;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.io.hdf5.H5;
+import org.apache.sysds.runtime.io.hdf5.H5ContiguousDataset;
+import org.apache.sysds.runtime.io.hdf5.H5RootObject;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+public class ReaderHDF5 extends MatrixReader {
+	protected final FileFormatPropertiesHDF5 _props;
+
+	public ReaderHDF5(FileFormatPropertiesHDF5 props) {
+		_props = props;
+	}
+
+	@Override public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int blen, long estnnz)

Review comment:
       move \@Override to the line above.

##########
File path: src/main/java/org/apache/sysds/runtime/io/ReaderHDF5Parallel.java
##########
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io;
+
+import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.TextInputFormat;
+import org.apache.sysds.conf.ConfigurationManager;
+import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.util.CommonThreadPool;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+
+public class ReaderHDF5Parallel extends ReaderHDF5 {
+
+	final private int _numThreads;
+	protected JobConf _job;
+
+	public ReaderHDF5Parallel(FileFormatPropertiesHDF5 props) {
+		super(props);
+		_numThreads = OptimizerUtils.getParallelTextReadParallelism();
+	}
+
+	@Override public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int blen, long estnnz)
+		throws IOException, DMLRuntimeException {
+
+		// prepare file access
+		_job = new JobConf(ConfigurationManager.getCachedJobConf());
+
+		Path path = new Path(fname);
+		FileSystem fs = IOUtilFunctions.getFileSystem(path, _job);
+
+		FileInputFormat.addInputPath(_job, path);
+		TextInputFormat informat = new TextInputFormat();
+		informat.configure(_job);
+
+		// check existence and non-empty file
+		checkValidInputFile(fs, path);
+
+		// allocate output matrix block
+		ArrayList<Path> files = new ArrayList<>();
+		files.add(path);
+		MatrixBlock src = computeHDF5Size(files, fs, _props.getDatasetName());
+
+		//create and execute tasks
+		try {
+			ExecutorService pool = CommonThreadPool.get(_numThreads);
+			ArrayList<ReadHDF5Task> tasks = new ArrayList<>();
+			rlen = src.getNumRows();
+			int blklen = (int) Math.ceil((double) rlen / _numThreads);
+			for(int i = 0; i < _numThreads & i * blklen < rlen; i++) {
+				BufferedInputStream bis = new BufferedInputStream(fs.open(path));
+				MutableInt row = new MutableInt(i * blklen);

Review comment:
       why do you need a MutableInt? is a plain int not sufficient? e.g.:
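   (sketch):
   
       int row = i * blklen; // a plain int suffices if the read position is not shared between tasks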

##########
File path: src/main/java/org/apache/sysds/runtime/io/WriterHDF5.java
##########
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.sysds.conf.ConfigurationManager;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.io.hdf5.H5;
+import org.apache.sysds.runtime.io.hdf5.H5RootObject;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.util.HDFSTool;
+
+import java.io.BufferedOutputStream;
+import java.io.IOException;
+
+public class WriterHDF5 extends MatrixWriter {
+
+	protected static FileFormatPropertiesHDF5 _props = null;
+
+	protected int _replication = -1;
+
+	public static final int BLOCKSIZE_J = 32; //32 cells (typically ~512B, should be less than write buffer of 1KB)

Review comment:
       B -> byte
   b -> bit
   
   I think there is something wrong with the comment:
   if the 32 cells are doubles and you mean bytes, then 32*8 = 256B.
   Or do the cells take 16 bytes per value?
   
   Also, is the write buffer really 1KB? (idk)
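   i.e., assuming the cells are doubles, the corrected comment would be (sketch):
   
       public static final int BLOCKSIZE_J = 32; // 32 cells * 8B per double = 256B per chunk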

##########
File path: src/main/java/org/apache/sysds/runtime/io/ReaderHDF5Parallel.java
##########
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io;
+
+import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.TextInputFormat;
+import org.apache.sysds.conf.ConfigurationManager;
+import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.util.CommonThreadPool;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+
+public class ReaderHDF5Parallel extends ReaderHDF5 {
+
+	final private int _numThreads;
+	protected JobConf _job;
+
+	public ReaderHDF5Parallel(FileFormatPropertiesHDF5 props) {
+		super(props);
+		_numThreads = OptimizerUtils.getParallelTextReadParallelism();
+	}
+
+	@Override public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int blen, long estnnz)

Review comment:
       \@Override on the line above.

##########
File path: src/main/java/org/apache/sysds/runtime/io/ReaderHDF5.java
##########
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io;
+
+import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.sysds.conf.ConfigurationManager;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.io.hdf5.H5;
+import org.apache.sysds.runtime.io.hdf5.H5ContiguousDataset;
+import org.apache.sysds.runtime.io.hdf5.H5RootObject;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+public class ReaderHDF5 extends MatrixReader {
+	protected final FileFormatPropertiesHDF5 _props;
+
+	public ReaderHDF5(FileFormatPropertiesHDF5 props) {
+		_props = props;
+	}
+
+	@Override public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int blen, long estnnz)
+		throws IOException, DMLRuntimeException {
+		//allocate output matrix block
+		MatrixBlock ret = null;
+		if(rlen >= 0 && clen >= 0) //otherwise allocated on read
+			ret = createOutputMatrixBlock(rlen, clen, (int) rlen, estnnz, true, false);
+
+		//prepare file access
+		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
+		Path path = new Path(fname);
+		FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
+
+		//check existence and non-empty file
+		checkValidInputFile(fs, path);
+		//core read 
+		ret = readHDF5MatrixFromHDFS(path, job, fs, ret, rlen, clen, blen, _props.getDatasetName());
+
+		//finally check if change of sparse/dense block representation required
+		//(nnz explicitly maintained during read)
+		ret.examSparsity();
+
+		return ret;
+	}
+
+	@Override public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long clen, int blen, long estnnz)
+		throws IOException, DMLRuntimeException {
+		//allocate output matrix block
+		MatrixBlock ret = createOutputMatrixBlock(rlen, clen, (int) rlen, estnnz, true, false);
+
+		//core read
+		String datasetName = _props.getDatasetName();
+
+		BufferedInputStream bis = new BufferedInputStream(is);
+
+		long lnnz = readMatrixFromHDF5(bis, datasetName, ret, new MutableInt(0), rlen, clen, blen);
+
+		//finally check if change of sparse/dense block representation required
+		ret.setNonZeros(lnnz);
+		ret.examSparsity();
+
+		return ret;
+	}
+
+	@SuppressWarnings("unchecked") private static MatrixBlock readHDF5MatrixFromHDFS(Path path, JobConf job,
+		FileSystem fs, MatrixBlock dest, long rlen, long clen, int blen, String datasetName)
+		throws IOException, DMLRuntimeException {
+		//prepare file paths in alphanumeric order
+		ArrayList<Path> files = new ArrayList<>();
+		if(fs.isDirectory(path)) {
+			for(FileStatus stat : fs.listStatus(path, IOUtilFunctions.hiddenFileFilter))
+				files.add(stat.getPath());
+			Collections.sort(files);
+		}
+		else
+			files.add(path);
+
+		//determine matrix size via additional pass if required
+		if(dest == null) {
+			dest = computeHDF5Size(files, fs, datasetName);
+			clen = dest.getNumColumns();
+			rlen = dest.getNumRows();
+		}
+
+		//actual read of individual files
+		long lnnz = 0;
+		MutableInt row = new MutableInt(0);
+		for(int fileNo = 0; fileNo < files.size(); fileNo++) {
+			BufferedInputStream bis = new BufferedInputStream(fs.open(files.get(fileNo)));
+			lnnz += readMatrixFromHDF5(bis, datasetName, dest, row, rlen, clen, blen);
+		}
+		//post processing
+		dest.setNonZeros(lnnz);
+
+		return dest;
+	}
+
+	public static long readMatrixFromHDF5(BufferedInputStream bis, String datasetName, MatrixBlock dest,
+		MutableInt rowPos, long rlen, long clen, int blen) {
+		int row = rowPos.intValue();
+		long lnnz = 0;
+		H5RootObject rootObject = H5.H5Fopen(bis);
+		H5ContiguousDataset contiguousDataset = H5.H5Dopen(rootObject, datasetName);
+
+		int[] dims = rootObject.getDimensions();
+		int ncol = dims[1];
+
+		DenseBlock denseBlock = dest.getDenseBlock();
+		for(int i = row; i < rlen; i++) {
+			double[] rowData = H5.H5Dread(rootObject, contiguousDataset, i);

Review comment:
       so each row is allocated as a new array?
   if so, you might get better performance by reading one double at a time
   and setting it into the matrix directly, since allocating this double array costs extra computation.
   alternatively, if you want to keep the array, H5Dread should reuse the same array on each call
   (i don't know if you do this already), e.g.:
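   a buffer-reuse variant (sketch; the H5Dread overload taking a preallocated row buffer is hypothetical):
   
       double[] rowBuf = new double[ncol]; // allocated once, reused for every row
       for(int i = row; i < rlen; i++) {
           H5.H5Dread(rootObject, contiguousDataset, i, rowBuf); // hypothetical overload filling rowBuf
           for(int j = 0; j < ncol; j++) {
               if(rowBuf[j] != 0) {
                   denseBlock.set(i, j, rowBuf[j]);
                   lnnz++;
               }
           }
       }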

##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/H5.java
##########
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io.hdf5;
+
+import org.apache.sysds.runtime.io.hdf5.message.H5SymbolTableMessage;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+public class H5 {
+
+	// H5 format write/read steps:
+	// 1. Create/Open a File (H5Fcreate)
+	// 2. Create/open a Dataspace
+	// 3. Create/Open a Dataset
+	// 4. Write/Read
+	// 5. Close File
+
+	public static H5RootObject H5Fopen(BufferedInputStream bis) {
+		bis.mark(0);
+
+		H5RootObject rootObject = new H5RootObject();
+		try {
+			// Find out if the file is a HDF5 file
+			int maxSignatureLength = 2048;
+			boolean validSignature = false;
+			long offset;
+			for(offset = 0; offset < maxSignatureLength; offset = nextOffset(offset)) {
+				validSignature = H5Superblock.verifySignature(bis, offset);
+				if(validSignature) {
+					break;
+				}
+			}
+			if(!validSignature) {
+				throw new H5Exception("No valid HDF5 signature found");
+			}
+			rootObject.setBufferedInputStream(bis);
+
+			final H5Superblock superblock = new H5Superblock(bis, offset);
+			rootObject.setSuperblock(superblock);
+		}
+		catch(Exception exception) {
+			//exception.printStackTrace();
+			throw new H5Exception("Can't open fine " + exception);
+		}
+		return rootObject;
+	}
+
+	private static long nextOffset(long offset) {
+		if(offset == 0) {
+			return 512L;
+		}
+		return offset * 2;
+	}
+
+	// Create Data Space
+	public static H5RootObject H5Screate(BufferedOutputStream bos, long row, long col) {
+
+		try {
+			H5RootObject rootObject = new H5RootObject();
+			rootObject.setBufferedOutputStream(bos);
+			rootObject.bufferBuilder = new H5BufferBuilder();
+			final H5Superblock superblock = new H5Superblock();
+			superblock.versionOfSuperblock = 0;
+			superblock.versionNumberOfTheFileFreeSpaceInformation = 0;
+			superblock.versionOfRootGroupSymbolTableEntry = 0;
+			superblock.versionOfSharedHeaderMessageFormat = 0;
+			superblock.sizeOfOffsets = 8;
+			superblock.sizeOfLengths = 8;
+			superblock.groupLeafNodeK = 4;
+			superblock.groupInternalNodeK = 16;
+			superblock.baseAddressByte = 0;
+			superblock.addressOfGlobalFreeSpaceIndex = -1;
+			superblock.endOfFileAddress = 2048 + (row * col * 8); // double value
+			superblock.driverInformationBlockAddress = -1;
+			superblock.rootGroupSymbolTableAddress = 56;
+
+			rootObject.setSuperblock(superblock);
+			rootObject.setRank(2);
+			rootObject.setCol(col);
+			rootObject.setRow(row);
+
+			superblock.toBuffer(rootObject.bufferBuilder);
+
+			H5SymbolTableEntry symbolTableEntry = new H5SymbolTableEntry(rootObject);
+			symbolTableEntry.toBuffer(rootObject.bufferBuilder);
+
+			return rootObject;
+
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	// Open a Data Space
+	public static H5ContiguousDataset H5Dopen(H5RootObject rootObject, String datasetName) {
+		try {
+			H5SymbolTableEntry symbolTableEntry = new H5SymbolTableEntry(rootObject,
+				rootObject.getSuperblock().rootGroupSymbolTableAddress - rootObject.getSuperblock().baseAddressByte);
+
+			H5ObjectHeader objectHeader = new H5ObjectHeader(rootObject, symbolTableEntry.getObjectHeaderAddress());
+
+			final H5SymbolTableMessage stm = (H5SymbolTableMessage) objectHeader.getMessages().get(0);
+			final H5BTree rootBTreeNode = new H5BTree(rootObject, stm.getbTreeAddress());
+			final H5LocalHeap rootNameHeap = new H5LocalHeap(rootObject, stm.getLocalHeapAddress());
+			final ByteBuffer nameBuffer = rootNameHeap.getDataBuffer();
+			final List<Long> childAddresses = rootBTreeNode.getChildAddresses();
+
+			long child = childAddresses.get(0);
+
+			H5GroupSymbolTableNode groupSTE = new H5GroupSymbolTableNode(rootObject, child);
+
+			symbolTableEntry = groupSTE.getSymbolTableEntries()[0];
+
+			nameBuffer.position(symbolTableEntry.getLinkNameOffset());
+			String childName = Utils.readUntilNull(nameBuffer);
+
+			if(!childName.equals(datasetName)) {
+				throw new H5Exception("The dataset name '" + datasetName + "' not found!");
+			}
+
+			final H5ObjectHeader header = new H5ObjectHeader(rootObject, symbolTableEntry.getObjectHeaderAddress());
+			final H5ContiguousDataset contiguousDataset = new H5ContiguousDataset(rootObject, header);
+			return contiguousDataset;
+
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	// Create Dataset
+	public static void H5Dcreate(H5RootObject rootObject, long maxRow, long maxCol, String datasetName) {
+
+		if(rootObject.getRank() == 2) {
+
+			rootObject.setMaxRow(maxRow);
+			rootObject.setMaxCol(maxCol);
+			rootObject.setDatasetName(datasetName);
+			H5ObjectHeader objectHeader = new H5ObjectHeader(rootObject, datasetName);
+			objectHeader.toBuffer(rootObject.bufferBuilder);
+			rootObject.bufferBuilder.goToPositionWithWriteZero(2048);
+
+		}
+		else
+			throw new H5Exception("Just support Matrix!");
+	}
+
+	public static void H5WriteHeaders(H5RootObject rootObject) {
+		try {
+			rootObject.getBufferedOutputStream().write(rootObject.bufferBuilder.build().array());
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	// Write Data
+	public static void H5Dwrite(H5RootObject rootObject, double[] data) {
+		try {
+			H5BufferBuilder bb = new H5BufferBuilder();
+			for(Double d : data) {
+				bb.writeDouble(d);
+			}
+			rootObject.getBufferedOutputStream().write(bb.noOrderBuild().array());
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	public static void H5Dwrite(H5RootObject rootObject, double[][] data) {

Review comment:
       I hope this method is not used inside the system (and is just for testing),
   since it implies that you convert the data structure to a nested array of double arrays.

##########
File path: src/main/java/org/apache/sysds/runtime/io/WriterHDF5.java
##########
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.sysds.conf.ConfigurationManager;
+import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.io.hdf5.H5;
+import org.apache.sysds.runtime.io.hdf5.H5RootObject;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.util.HDFSTool;
+
+import java.io.BufferedOutputStream;
+import java.io.IOException;
+
+public class WriterHDF5 extends MatrixWriter {
+
+	protected static FileFormatPropertiesHDF5 _props = null;
+
+	protected int _replication = -1;
+
+	public static final int BLOCKSIZE_J = 32; //32 cells (typically ~512B, should be less than write buffer of 1KB)
+
+	public WriterHDF5(FileFormatPropertiesHDF5 _props) {
+		WriterHDF5._props = _props;
+	}
+
+	@Override public void writeMatrixToHDFS(MatrixBlock src, String fname, long rlen, long clen, int blen, long nnz,
+		boolean diag) throws IOException, DMLRuntimeException {
+
+		//validity check matrix dimensions
+		if(src.getNumRows() != rlen || src.getNumColumns() != clen)
+			throw new IOException("Matrix dimensions mismatch with metadata: " + src.getNumRows() + "x" + src
+				.getNumColumns() + " vs " + rlen + "x" + clen + ".");
+		if(rlen == 0 || clen == 0)
+			throw new IOException(
+				"Write of matrices with zero rows or columns not supported (" + rlen + "x" + clen + ").");
+
+		//prepare file access
+		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
+		Path path = new Path(fname);
+		FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
+
+		//if the file already exists on HDFS, remove it.
+		HDFSTool.deleteFileIfExistOnHDFS(fname);
+
+		//core write (sequential/parallel)
+		writeHDF5MatrixToHDFS(path, job, fs, src);
+
+		IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, path);
+
+	}
+
+	@Override public final void writeEmptyMatrixToHDFS(String fname, long rlen, long clen, int blen)
+		throws IOException, DMLRuntimeException {
+
+	}
+
+	protected void writeHDF5MatrixToHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock src) throws IOException {
+		//sequential write HDF5 file
+		writeHDF5MatrixToFile(path, job, fs, src, 0, src.getNumRows());
+	}
+
+	protected static void writeHDF5MatrixToFile(Path path, JobConf job, FileSystem fs, MatrixBlock src, int rl,
+		int rlen) throws IOException {
+
+		int clen = src.getNumColumns();
+
+		BufferedOutputStream bos = new BufferedOutputStream(fs.create(path, true));
+		String datasetName = _props.getDatasetName();
+		H5RootObject rootObject = H5.H5Screate(bos, src.getNumRows(), src.getNumColumns());
+		H5.H5Dcreate(rootObject, src.getNumRows(), src.getNumColumns(), datasetName);
+
+		//write headers
+		if(rl == 0) {
+			H5.H5WriteHeaders(rootObject);
+		}
+
+		try {
+			// The HDF5 format does not support sparse matrices
+			// Write the data to the datasets.
+			for(int i = rl; i < rlen; i++) {
+				double[] dataRow = new double[clen];
+				//write row chunk-wise to prevent OOM on large number of columns
+				for(int bj = 0; bj < clen; bj += BLOCKSIZE_J) {
+					for(int j = bj; j < Math.min(clen, bj + BLOCKSIZE_J); j++) {
+						double lvalue = src.getValueDenseUnsafe(i, j);

Review comment:
       This seems unsafe; I would suggest checking whether the MatrixBlock is dense or sparse,
   and then constructing an iterator for each case (see the sketch below).
   
   This also leads me to suspect that you don't have tests for writing sparse matrices.
   

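   A rough sketch of the suggested branching, assuming SystemDS's SparseBlock
   row API (pos/size/indexes/values), which is how other writers iterate sparse rows:

```java
// Sketch only: branch on the block representation instead of calling
// getValueDenseUnsafe unconditionally.
if(src.isInSparseFormat()) {
	SparseBlock sb = src.getSparseBlock();
	for(int i = rl; i < rlen; i++) {
		double[] dataRow = new double[clen]; // implicit zeros for empty cells
		if(sb != null && !sb.isEmpty(i)) {
			int apos = sb.pos(i);
			int alen = sb.size(i);
			int[] aix = sb.indexes(i);
			double[] avals = sb.values(i);
			for(int k = apos; k < apos + alen; k++)
				dataRow[aix[k]] = avals[k]; // scatter non-zeros into the row
		}
		H5.H5Dwrite(rootObject, dataRow);
	}
}
else {
	// dense path: keep the existing chunk-wise copy via getValueDenseUnsafe
}
```
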
##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/H5.java
##########
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io.hdf5;
+
+import org.apache.sysds.runtime.io.hdf5.message.H5SymbolTableMessage;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+public class H5 {
+
+	// H5 format write/read steps:
+	// 1. Create/Open a File (H5Fcreate)
+	// 2. Create/open a Dataspace
+	// 3. Create/Open a Dataset
+	// 4. Write/Read
+	// 5. Close File
+
+	public static H5RootObject H5Fopen(BufferedInputStream bis) {
+		bis.mark(0);
+
+		H5RootObject rootObject = new H5RootObject();
+		try {
+			// Find out if the file is an HDF5 file
+			int maxSignatureLength = 2048;
+			boolean validSignature = false;
+			long offset;
+			for(offset = 0; offset < maxSignatureLength; offset = nextOffset(offset)) {
+				validSignature = H5Superblock.verifySignature(bis, offset);
+				if(validSignature) {
+					break;
+				}
+			}
+			if(!validSignature) {
+				throw new H5Exception("No valid HDF5 signature found");
+			}
+			rootObject.setBufferedInputStream(bis);
+
+			final H5Superblock superblock = new H5Superblock(bis, offset);
+			rootObject.setSuperblock(superblock);
+		}
+		catch(Exception exception) {
+			//exception.printStackTrace();
+			throw new H5Exception("Can't open fine " + exception);
+		}
+		return rootObject;
+	}
+
+	private static long nextOffset(long offset) {
+		if(offset == 0) {
+			return 512L;
+		}
+		return offset * 2;
+	}
+
+	// Create Data Space
+	public static H5RootObject H5Screate(BufferedOutputStream bos, long row, long col) {
+
+		try {
+			H5RootObject rootObject = new H5RootObject();
+			rootObject.setBufferedOutputStream(bos);
+			rootObject.bufferBuilder = new H5BufferBuilder();
+			final H5Superblock superblock = new H5Superblock();
+			superblock.versionOfSuperblock = 0;
+			superblock.versionNumberOfTheFileFreeSpaceInformation = 0;
+			superblock.versionOfRootGroupSymbolTableEntry = 0;
+			superblock.versionOfSharedHeaderMessageFormat = 0;
+			superblock.sizeOfOffsets = 8;
+			superblock.sizeOfLengths = 8;
+			superblock.groupLeafNodeK = 4;
+			superblock.groupInternalNodeK = 16;
+			superblock.baseAddressByte = 0;
+			superblock.addressOfGlobalFreeSpaceIndex = -1;
+			superblock.endOfFileAddress = 2048 + (row * col * 8); // double value
+			superblock.driverInformationBlockAddress = -1;
+			superblock.rootGroupSymbolTableAddress = 56;
+
+			rootObject.setSuperblock(superblock);
+			rootObject.setRank(2);
+			rootObject.setCol(col);
+			rootObject.setRow(row);
+
+			superblock.toBuffer(rootObject.bufferBuilder);
+
+			H5SymbolTableEntry symbolTableEntry = new H5SymbolTableEntry(rootObject);
+			symbolTableEntry.toBuffer(rootObject.bufferBuilder);
+
+			return rootObject;
+
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	// Open a Dataset
+	public static H5ContiguousDataset H5Dopen(H5RootObject rootObject, String datasetName) {
+		try {
+			H5SymbolTableEntry symbolTableEntry = new H5SymbolTableEntry(rootObject,
+				rootObject.getSuperblock().rootGroupSymbolTableAddress - rootObject.getSuperblock().baseAddressByte);
+
+			H5ObjectHeader objectHeader = new H5ObjectHeader(rootObject, symbolTableEntry.getObjectHeaderAddress());
+
+			final H5SymbolTableMessage stm = (H5SymbolTableMessage) objectHeader.getMessages().get(0);
+			final H5BTree rootBTreeNode = new H5BTree(rootObject, stm.getbTreeAddress());
+			final H5LocalHeap rootNameHeap = new H5LocalHeap(rootObject, stm.getLocalHeapAddress());
+			final ByteBuffer nameBuffer = rootNameHeap.getDataBuffer();
+			final List<Long> childAddresses = rootBTreeNode.getChildAddresses();
+
+			long child = childAddresses.get(0);
+
+			H5GroupSymbolTableNode groupSTE = new H5GroupSymbolTableNode(rootObject, child);
+
+			symbolTableEntry = groupSTE.getSymbolTableEntries()[0];
+
+			nameBuffer.position(symbolTableEntry.getLinkNameOffset());
+			String childName = Utils.readUntilNull(nameBuffer);
+
+			if(!childName.equals(datasetName)) {
+				throw new H5Exception("The dataset name '" + datasetName + "' not found!");
+			}
+
+			final H5ObjectHeader header = new H5ObjectHeader(rootObject, symbolTableEntry.getObjectHeaderAddress());
+			final H5ContiguousDataset contiguousDataset = new H5ContiguousDataset(rootObject, header);
+			return contiguousDataset;
+
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	// Create Dataset
+	public static void H5Dcreate(H5RootObject rootObject, long maxRow, long maxCol, String datasetName) {
+
+		if(rootObject.getRank() == 2) {
+
+			rootObject.setMaxRow(maxRow);
+			rootObject.setMaxCol(maxCol);
+			rootObject.setDatasetName(datasetName);
+			H5ObjectHeader objectHeader = new H5ObjectHeader(rootObject, datasetName);
+			objectHeader.toBuffer(rootObject.bufferBuilder);
+			rootObject.bufferBuilder.goToPositionWithWriteZero(2048);
+
+		}
+		else
+			throw new H5Exception("Just support Matrix!");
+	}
+
+	public static void H5WriteHeaders(H5RootObject rootObject) {
+		try {
+			rootObject.getBufferedOutputStream().write(rootObject.bufferBuilder.build().array());
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	// Write Data
+	public static void H5Dwrite(H5RootObject rootObject, double[] data) {
+		try {
+			H5BufferBuilder bb = new H5BufferBuilder();
+			for(double d : data) {
+				bb.writeDouble(d);
+			}
+			rootObject.getBufferedOutputStream().write(bb.noOrderBuild().array());
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	public static void H5Dwrite(H5RootObject rootObject, double[][] data) {
+
+		long dataSize = rootObject.getRow() * rootObject.getCol() * 8; // 8 bytes per double value
+		int maxBufferSize = (int) (Integer.MAX_VALUE * 0.2); // cap the write buffer at ~20% of Integer.MAX_VALUE
+		int bufferSize = (int) Math.min(maxBufferSize, dataSize);
+
+		H5BufferBuilder bb = new H5BufferBuilder();
+		try {
+			for(int i = 0; i < rootObject.getRow(); i++) {
+				for(int j = 0; j < rootObject.getCol(); j++) {
+
+					// if the buffer is full, flush it to the file and reset the buffer
+					if(bb.getSize() + 8 > bufferSize) {
+						rootObject.getBufferedOutputStream().write(bb.build().array());
+						bb = new H5BufferBuilder();
+					}
+					bb.writeDouble(data[i][j]);
+				}
+			}
+			// write last data to buffer
+			rootObject.getBufferedOutputStream().write(bb.build().array());
+		}
+		catch(Exception exception) {
+			throw new H5Exception(exception);
+		}
+	}
+
+	public static double[][] H5Dread(H5RootObject rootObject, H5ContiguousDataset dataset) {
+
+		ByteBuffer buffer = dataset.getDataBuffer();
+		int[] dimensions = rootObject.getDimensions();
+		final double[][] data = dataset.getDataType().getDoubleDataType().fillData(buffer, dimensions);
+		return data;
+	}
+
+	public static double[] H5Dread(H5RootObject rootObject, H5ContiguousDataset dataset, int row) {

Review comment:
       Reading a row at a time is fine, but the allocation of these double arrays
   could hopefully be avoided; perhaps extend this method to take a double[] as
   input and write the parsed data into it? A sketch follows below.

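   One way the suggested signature could look; this is only a sketch, assuming
   java.nio.ByteOrder is imported and the bulk copy is done via
   ByteBuffer.asDoubleBuffer():

```java
// Sketch: the caller owns and reuses the row buffer across reads.
public static void H5Dread(H5RootObject rootObject, H5ContiguousDataset dataset,
	int row, double[] target) {
	ByteBuffer buffer = dataset.getDataBuffer(row);
	buffer.order(ByteOrder.LITTLE_ENDIAN); // interpret raw bytes as little-endian
	buffer.asDoubleBuffer().get(target); // bulk-copy one row of doubles
}
```
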
##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/Utils.java
##########
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.runtime.io.hdf5;
+
+import org.apache.commons.lang3.ArrayUtils;
+
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.BitSet;
+
+import static java.nio.ByteOrder.LITTLE_ENDIAN;
+
+public final class Utils {
+
+	private Utils() {
+		throw new AssertionError("No instances of Utils");
+	}
+
+	public static String readUntilNull(ByteBuffer buffer) {
+		StringBuilder sb = new StringBuilder(buffer.remaining());
+		while(buffer.hasRemaining()) {
+			byte b = buffer.get();
+			if(b == H5Constants.NULL) {
+				return sb.toString();
+			}
+			sb.append((char) b);
+		}
+		throw new IllegalArgumentException("End of buffer reached before NULL");
+	}
+
+	public static void seekBufferToNextMultipleOfEight(ByteBuffer bb) {
+		int pos = bb.position();
+		if(pos % 8 == 0) {
+			return; // Already on a 8 byte multiple
+		}
+		bb.position(pos + (8 - (pos % 8)));
+	}
+
+	public static int readBytesAsUnsignedInt(ByteBuffer buffer, int length) {
+		switch(length) {
+			case 1:
+				return Byte.toUnsignedInt(buffer.get());
+			case 2:
+				return Short.toUnsignedInt(buffer.getShort());
+			case 3:
+				return readArbitraryLengthBytesAsUnsignedInt(buffer, length);
+			case 4:
+				int value = buffer.getInt();
+				if(value < 0) {
+					throw new ArithmeticException("Could not convert to unsigned");
+				}
+				return value;
+			case 5:
+			case 6:
+			case 7:
+				return readArbitraryLengthBytesAsUnsignedInt(buffer, length);
+			case 8:
+				// Throws if the long can't be converted safely
+				return Math.toIntExact(buffer.getLong());
+			default:
+				throw new IllegalArgumentException("Couldn't read " + length + " bytes as int");
+		}
+	}
+
+	private static int readArbitraryLengthBytesAsUnsignedInt(ByteBuffer buffer, int length) {
+		byte[] bytes = new byte[length];
+		buffer.get(bytes);
+		if(buffer.order() == LITTLE_ENDIAN) {
+			ArrayUtils.reverse(bytes);
+		}
+		return new BigInteger(1, bytes).intValueExact();
+	}
+
+	public static long readBytesAsUnsignedLong(ByteBuffer buffer, int length) {
+		switch(length) {
+			case 1:
+				return Byte.toUnsignedLong(buffer.get());
+			case 2:
+				return Short.toUnsignedLong(buffer.getShort());
+			case 3:
+				return readArbitraryLengthBytesAsUnsignedLong(buffer, length);
+			case 4:
+				return Integer.toUnsignedLong(buffer.getInt());
+			case 5:
+			case 6:
+			case 7:
+				return readArbitraryLengthBytesAsUnsignedLong(buffer, length);
+			case 8:
+				long value = buffer.getLong();
+				if(value < 0 && value != H5Constants.UNDEFINED_ADDRESS) {
+					throw new ArithmeticException("Could not convert to unsigned");
+				}
+				return value;
+			default:
+				throw new IllegalArgumentException("Couldn't read " + length + " bytes as int");
+		}
+	}
+
+	private static long readArbitraryLengthBytesAsUnsignedLong(ByteBuffer buffer, int length) {
+		byte[] bytes = new byte[length];
+		buffer.get(bytes);
+		if(buffer.order() == LITTLE_ENDIAN) {
+			ArrayUtils.reverse(bytes);
+		}
+		return new BigInteger(1, bytes).longValueExact();
+	}
+
+	public static ByteBuffer createSubBuffer(ByteBuffer source, int length) {
+		ByteBuffer headerData = source.slice();
+		headerData.limit(length);
+		headerData.order(source.order());
+
+		source.position(source.position() + length);
+		return headerData;
+	}
+
+	private static final BigInteger TWO = BigInteger.valueOf(2);

Review comment:
       Declare variables at the top of the class.
   
   Also, BigInteger of two? Later on you also use BigInteger of zero. Is int or
   long not big enough? A long-based sketch follows below.

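   If the exponents involved come from offset/length sizes of at most a few bytes
   (an assumption about how these constants are used), plain long arithmetic would
   cover both constants:

```java
// Sketch: 2^exponent as a long, valid for exponents up to 62; the zero
// constant is then just the literal 0L.
public static long powerOfTwo(int exponent) {
	if(exponent < 0 || exponent > 62)
		throw new IllegalArgumentException("Exponent out of range: " + exponent);
	return 1L << exponent;
}
```
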
##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/H5ContiguousDataset.java
##########
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.runtime.io.hdf5;
+
+import org.apache.sysds.runtime.io.hdf5.message.H5DataLayoutMessage;
+import org.apache.sysds.runtime.io.hdf5.message.H5DataSpaceMessage;
+import org.apache.sysds.runtime.io.hdf5.message.H5DataTypeMessage;
+
+import java.nio.ByteBuffer;
+
+import static java.nio.ByteOrder.LITTLE_ENDIAN;
+
+public class H5ContiguousDataset {
+
+	private final H5RootObject rootObject;
+	//private final H5ObjectHeader objectHeader;

Review comment:
       remove unused code

##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/H5Exception.java
##########
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io.hdf5;
+
+import java.io.IOException;
+
+public class H5Exception extends RuntimeException {

Review comment:
       Maybe extend DMLRuntimeException? Then you don't need to define that many
   functions. A sketch follows below.

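   Something along these lines; a sketch that assumes DMLRuntimeException exposes
   the usual message and cause constructors:

```java
package org.apache.sysds.runtime.io.hdf5;

import org.apache.sysds.runtime.DMLRuntimeException;

// Sketch: forward to the parent constructors instead of redefining helpers.
public class H5Exception extends DMLRuntimeException {
	private static final long serialVersionUID = 1L;

	public H5Exception(String message) {
		super(message);
	}

	public H5Exception(Exception e) {
		super(e);
	}
}
```
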
##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/H5RootObject.java
##########
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io.hdf5;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import static java.nio.ByteOrder.LITTLE_ENDIAN;
+
+public class H5RootObject {
+
+	protected BufferedInputStream bufferedInputStream;
+	protected BufferedOutputStream bufferedOutputStream;
+	//------------------
+	//protected FileChannel fileChannel;
+	protected H5Superblock superblock;
+	protected int rank;
+	protected long row;
+	protected long col;
+	protected int[] dimensions;
+	protected long maxRow;
+	protected long maxCol;
+	protected int[] maxSizes;
+	protected String datasetName;
+	public H5BufferBuilder bufferBuilder;
+
+	protected byte dataSpaceVersion = 1;
+	protected byte objectHeaderVersion = 1;
+	protected byte localHeapVersion = 0;
+	protected byte fillValueVersion = 2;
+	protected byte dataLayoutVersion = 3;
+	protected byte objectModificationTimeVersion = 1;
+	protected byte groupSymbolTableNodeVersion = 1;
+
+	protected byte dataLayoutClass = 1;
+
+	protected H5BufferBuilder toBuffer() {
+		H5BufferBuilder bb = new H5BufferBuilder();
+		this.toBuffer(bb);
+		return bb;
+	}
+
+	protected void toBuffer(H5BufferBuilder bb) {
+	}
+
+	public ByteBuffer readBufferFromAddress(long address, int length) {
+		ByteBuffer bb = ByteBuffer.allocate(length);
+		try {
+			byte[] b = new byte[length];
+			bufferedInputStream.reset();
+			bufferedInputStream.skip(address);
+			bufferedInputStream.read(b);
+			bb.put(b);
+		}
+		catch(IOException e) {
+			throw new H5Exception(e);
+		}
+		bb.order(LITTLE_ENDIAN);
+		bb.rewind();
+		return bb;
+	}
+
+	public ByteBuffer readBufferFromAddressNoOrder(long address, int length) {
+		ByteBuffer bb = ByteBuffer.allocate(length);
+		try {
+			byte[] b = new byte[length];
+			bufferedInputStream.mark(0);
+			//bufferedInputStream.reset();

Review comment:
       remove

##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/H5ContiguousDataset.java
##########
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.apache.sysds.runtime.io.hdf5;
+
+import org.apache.sysds.runtime.io.hdf5.message.H5DataLayoutMessage;
+import org.apache.sysds.runtime.io.hdf5.message.H5DataSpaceMessage;
+import org.apache.sysds.runtime.io.hdf5.message.H5DataTypeMessage;
+
+import java.nio.ByteBuffer;
+
+import static java.nio.ByteOrder.LITTLE_ENDIAN;
+
+public class H5ContiguousDataset {
+
+	private final H5RootObject rootObject;
+	//private final H5ObjectHeader objectHeader;
+	private final H5DataLayoutMessage dataLayoutMessage;
+	private final H5DataTypeMessage dataTypeMessage;
+	private final H5DataSpaceMessage dataSpaceMessage;
+
+	public H5ContiguousDataset(H5RootObject rootObject, H5ObjectHeader objectHeader) {
+
+		this.rootObject = rootObject;
+		//	this.objectHeader = objectHeader;
+		this.dataLayoutMessage = objectHeader.getMessageOfType(H5DataLayoutMessage.class);
+		this.dataTypeMessage = objectHeader.getMessageOfType(H5DataTypeMessage.class);
+		this.dataSpaceMessage = objectHeader.getMessageOfType(H5DataSpaceMessage.class);
+	}
+
+	public ByteBuffer getDataBuffer(int row) {
+		try {
+			long rowPos = row * rootObject.getCol() * this.dataTypeMessage.getDoubleDataType().getSize();
+			ByteBuffer data = rootObject.readBufferFromAddressNoOrder(dataLayoutMessage.getAddress() + rowPos,
+				(int) (rootObject.getCol() * this.dataTypeMessage.getDoubleDataType().getSize()));
+			//data.order(LITTLE_ENDIAN);
+
+			return data;
+		}
+		catch(Exception e) {
+			e.printStackTrace();
+			throw new H5Exception("Failed to map data buffer for dataset", e);
+		}
+	}
+
+	public ByteBuffer getDataBuffer() {
+		try {
+			ByteBuffer data = rootObject.readBufferFromAddress(dataLayoutMessage.getAddress(),
+				(int) (this.dataSpaceMessage.getTotalLength() * this.dataTypeMessage.getDoubleDataType().getSize()));
+			data.order(LITTLE_ENDIAN);

Review comment:
       Is this used here, but not in the other method? Maybe a test is missing?
   A sketch of making both read paths consistent follows below.

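   If both read paths are meant to return little-endian data, the row variant
   above could set the order as well; a fragment-level sketch, reusing the fields
   and row offset already computed in getDataBuffer(int row):

```java
// Sketch: mirror the full-buffer variant inside getDataBuffer(int row).
long rowBytes = rootObject.getCol() * this.dataTypeMessage.getDoubleDataType().getSize();
ByteBuffer data = rootObject.readBufferFromAddressNoOrder(
	dataLayoutMessage.getAddress() + row * rowBytes, (int) rowBytes);
data.order(LITTLE_ENDIAN); // now both read paths agree on byte order
return data;
```
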
##########
File path: src/main/java/org/apache/sysds/runtime/io/hdf5/message/H5Message.java
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io.hdf5.message;
+
+import org.apache.sysds.runtime.io.hdf5.*;
+
+import java.nio.ByteBuffer;
+import java.util.BitSet;
+
+public class H5Message {
+
+	private final BitSet flags;
+
+	protected H5RootObject rootObject;
+
+	public H5Message(H5RootObject rootObject, BitSet flags) {
+		this.rootObject = rootObject;
+		this.flags = flags;
+	}
+
+	public static H5Message readObjectHeaderMessage(H5RootObject rootObject, ByteBuffer bb) {
+		Utils.seekBufferToNextMultipleOfEight(bb);
+		int messageType = Utils.readBytesAsUnsignedInt(bb, 2);
+		int dataSize = Utils.readBytesAsUnsignedInt(bb, 2);
+		BitSet flags = BitSet.valueOf(new byte[] {bb.get()});
+
+		// Skip 3 reserved zero bytes
+		bb.position(bb.position() + 3);
+
+		// Create a new buffer holding this header data
+		final ByteBuffer headerData = Utils.createSubBuffer(bb, dataSize);
+		final H5Message message = readMessage(rootObject, headerData, messageType, flags);
+		return message;
+	}
+
+	protected H5BufferBuilder toBuffer() {
+		H5BufferBuilder bb = new H5BufferBuilder();
+		this.toBuffer(bb);
+		return bb;
+	}
+
+	public void toBuffer(H5BufferBuilder bb) {

Review comment:
       should this method do nothing?

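   Two usual alternatives to a silent no-op, sketched below: make the base method
   abstract so every message type must serialize itself, or fail fast when a
   message does not support writing.

```java
// Option 1 (sketch): force subclasses to implement serialization.
public abstract void toBuffer(H5BufferBuilder bb);

// Option 2 (sketch): keep H5Message concrete but fail loudly.
public void toBuffer(H5BufferBuilder bb) {
	throw new UnsupportedOperationException(
		"toBuffer not implemented for " + getClass().getSimpleName());
}
```
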
##########
File path: src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java
##########
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.io.hdf5;
+
+import org.apache.sysds.api.DMLScript;
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.conf.CompilerConfig;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.TestConfiguration;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public abstract class ReadHDF5Test extends ReadHDF5TestBase {
+
+	protected abstract int getId();
+
+	protected String getInputHDF5FileName() {
+		return "transfusion_" + getId() + ".h5";
+	}
+
+	private final static double eps = 1e-9;
+
+	@Test public void testHDF51_Seq_CP() {
+		runReadHDF5Test(getId(), ExecMode.SINGLE_NODE, false);
+	}
+
+	@Test public void testHDF51_Parallel_CP() {
+		runReadHDF5Test(getId(), ExecMode.SINGLE_NODE, true);
+	}
+
+	protected void runReadHDF5Test(int testNumber, ExecMode platform, boolean parallel) {
+
+		ExecMode oldPlatform = rtplatform;
+		rtplatform = platform;
+
+		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if(rtplatform == ExecMode.SPARK)
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+		boolean oldpar = CompilerConfig.FLAG_PARREADWRITE_TEXT;
+
+		try {
+
+			CompilerConfig.FLAG_PARREADWRITE_TEXT = parallel;
+
+			TestConfiguration config = getTestConfiguration(getTestName());
+			loadTestConfiguration(config);
+
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			String inputMatrixName = HOME + INPUT_DIR + getInputHDF5FileName();
+			String dmlOutput = output("dml.scalar");
+
+			String datasetName = "DATASET_1";
+
+			//TODO: add RScript for verify file format
+
+			fullDMLScriptName = HOME + getTestName() + "_" + testNumber + ".dml";
+			programArgs = new String[] {"-args", inputMatrixName, datasetName, output("Y")};
+
+			runTest(true, false, null, -1);
+
+			HashMap<MatrixValue.CellIndex, Double> YSYSTEMDS = readDMLMatrixFromOutputDir("Y");

Review comment:
       Missing verification of read values.

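   A sketch of the missing check, assuming a reference matrix is produced into the
   expected directory (readRMatrixFromExpectedDir and TestUtils.compareMatrices
   are assumptions about the available test-base helpers):

```java
// Sketch: compare the DML output against a reference with tolerance eps;
// requires import org.apache.sysds.test.TestUtils.
HashMap<MatrixValue.CellIndex, Double> YSYSTEMDS = readDMLMatrixFromOutputDir("Y");
HashMap<MatrixValue.CellIndex, Double> YR = readRMatrixFromExpectedDir("Y");
TestUtils.compareMatrices(YR, YSYSTEMDS, eps, "Y-R", "Y-SystemDS");
```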



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org