You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@reef.apache.org by dh...@apache.org on 2015/10/27 21:55:06 UTC

incubator-reef git commit: [REEF-873] fix FileSystemPartitionInputDataSet id issue

Repository: incubator-reef
Updated Branches:
  refs/heads/master 5bb6ca55c -> db0462047


[REEF-873] fix FileSystemPartitionInputDataSet id issue

This PR changes the way the id for FileSystemPartitionInputDataSet is generated. The original approach tried to derive the id from the input file name so that it would be easy to recognize. Given that the input file name itself can be a random number, and that a long file name may cause errors when parsing it, we would like to choose a simpler way to generate the id.

JIRA:
[REEF-873](https://issues.apache.org/jira/browse/REEF-873)

Pull Request:
  Closes #592


Project: http://git-wip-us.apache.org/repos/asf/incubator-reef/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-reef/commit/db046204
Tree: http://git-wip-us.apache.org/repos/asf/incubator-reef/tree/db046204
Diff: http://git-wip-us.apache.org/repos/asf/incubator-reef/diff/db046204

Branch: refs/heads/master
Commit: db046204731ca1f02d9b51e64b997007734a1260
Parents: 5bb6ca5
Author: Julia Wang <ju...@microsoft.com>
Authored: Mon Oct 26 14:57:59 2015 -0700
Committer: dhruv <dh...@apache.org>
Committed: Tue Oct 27 13:52:16 2015 -0700

----------------------------------------------------------------------
 .../TestFilePartitionInputDataSet.cs            | 16 +++++++++++
 .../FileSystemPartitionInputDataSet.cs          | 29 ++++++++++++++------
 2 files changed, 37 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-reef/blob/db046204/lang/cs/Org.Apache.REEF.IO.Tests/TestFilePartitionInputDataSet.cs
----------------------------------------------------------------------
diff --git a/lang/cs/Org.Apache.REEF.IO.Tests/TestFilePartitionInputDataSet.cs b/lang/cs/Org.Apache.REEF.IO.Tests/TestFilePartitionInputDataSet.cs
index a231788..2162719 100644
--- a/lang/cs/Org.Apache.REEF.IO.Tests/TestFilePartitionInputDataSet.cs
+++ b/lang/cs/Org.Apache.REEF.IO.Tests/TestFilePartitionInputDataSet.cs
@@ -46,6 +46,22 @@ namespace Org.Apache.REEF.IO.Tests
         string sourceFilePath1 = Path.Combine(Path.GetTempPath(), tempFileName1);
         string sourceFilePath2 = Path.Combine(Path.GetTempPath(), tempFileName2);
 
+        [TestMethod]
+        public void TestDataSetId()
+        {
+            string filePaths = string.Format(CultureInfo.CurrentCulture, "{0};{1};{2};{3}", "/tmp/abc", "tmp//cde.txt", "efg", "tmp\\hhh");
+
+            var dataSet = TangFactory.GetTang()
+                .NewInjector(FileSystemInputPartitionConfiguration<IEnumerable<byte>>.ConfigurationModule
+                    .Set(FileSystemInputPartitionConfiguration<IEnumerable<byte>>.FilePathForPartitions, filePaths)
+                    .Set(FileSystemInputPartitionConfiguration<IEnumerable<byte>>.FileSerializerConfig,
+                        GetByteSerializerConfigString())
+                    .Build())
+                .GetInstance<IPartitionedInputDataSet>();
+            // The expected id ends with "hhh", the text after the last path separator in filePaths.
+            // Expected value goes first so that a failure message reports expected/actual correctly.
+            Assert.AreEqual("FileSystemDataSet-hhh", dataSet.Id);
+
         /// <remarks>
         /// This test creates IPartitionDataSet with FileSystemInputPartitionConfiguration module.
         /// It then instantiates each IInputPartition using the IConfiguration provided by the IPartitionDescriptor.

http://git-wip-us.apache.org/repos/asf/incubator-reef/blob/db046204/lang/cs/Org.Apache.REEF.IO/PartitionedData/FileSystem/FileSystemPartitionInputDataSet.cs
----------------------------------------------------------------------
diff --git a/lang/cs/Org.Apache.REEF.IO/PartitionedData/FileSystem/FileSystemPartitionInputDataSet.cs b/lang/cs/Org.Apache.REEF.IO/PartitionedData/FileSystem/FileSystemPartitionInputDataSet.cs
index f1103e3..0abfb9a 100644
--- a/lang/cs/Org.Apache.REEF.IO/PartitionedData/FileSystem/FileSystemPartitionInputDataSet.cs
+++ b/lang/cs/Org.Apache.REEF.IO/PartitionedData/FileSystem/FileSystemPartitionInputDataSet.cs
@@ -20,6 +20,7 @@
 using System;
 using System.Collections.Generic;
 using System.Collections;
+using System.Globalization;
 using System.IO;
 using System.Linq;
 using Org.Apache.REEF.IO.FileSystem;
@@ -29,6 +30,7 @@ using Org.Apache.REEF.Tang.Formats;
 using Org.Apache.REEF.Tang.Implementations.Configuration;
 using Org.Apache.REEF.Tang.Implementations.Tang;
 using Org.Apache.REEF.Tang.Interface;
+using Org.Apache.REEF.Utilities.Logging;
 
 namespace Org.Apache.REEF.IO.PartitionedData.FileSystem
 {
@@ -40,6 +42,7 @@ namespace Org.Apache.REEF.IO.PartitionedData.FileSystem
     /// <typeparam name="T"></typeparam>
     internal sealed class FileSystemPartitionInputDataSet<T> : IPartitionedInputDataSet
     {
+        private static readonly Logger Logger = Logger.GetLogger(typeof(FileSystemPartitionInputDataSet<T>));
         private readonly Dictionary<string, IPartitionDescriptor> _partitions;
         private readonly int _count ;
         private const string StringSeparators = ";";
@@ -56,6 +59,7 @@ namespace Org.Apache.REEF.IO.PartitionedData.FileSystem
         {
             _count = filePaths.Count;
             _id = FormId(filePaths);
+
             _partitions = new Dictionary<string, IPartitionDescriptor>(_count);
 
             var fileSerializerConfig = 
@@ -116,22 +120,31 @@ namespace Org.Apache.REEF.IO.PartitionedData.FileSystem
             return _partitions.Values.GetEnumerator();
         }
 
-        private string FormId(ISet<string> filePaths)
+        /// <summary>
+        /// Derives the data set id from the last path segment of the first entry in filePaths; if that is not possible, the bare IdPrefix is used.
+        /// </summary>
+        /// <param name="filePaths">Input file paths; only the first entry is inspected. May be null or empty.</param>
+        /// <returns>IdPrefix followed by the derived segment, or IdPrefix alone when nothing could be derived.</returns>
+        private static string FormId(ISet<string> filePaths)
         {
             string id = "";
-            if (filePaths != null && filePaths.Count > 0)
+            try
             {
-                var path = filePaths.First();
-                var paths = path.Split(new string[] {StringSeparators}, StringSplitOptions.None);
-                if (paths.Length > 0)
+                if (filePaths != null && filePaths.Count > 0)
                 {
-                    FileInfo fInfo = new FileInfo(paths[0]);
-                    if (fInfo.Directory != null)
+                    var path = filePaths.First();
+                    var paths = path.Split(new string[] {"/", "//", "\\"}, StringSplitOptions.None);
+                    if (paths.Length > 0)
                     {
-                        id = fInfo.Directory.Name;
+                        id = paths[paths.Length - 1];
                     }
                 }
             }
+            catch (Exception e)
+            {
+                Logger.Log(Level.Warning, string.Format(CultureInfo.CurrentCulture, "The filePaths cannot be parsed for generating dataset id: {0}", e));
+            }
+
             return IdPrefix + id;
         }
     }