You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by kh...@apache.org on 2020/05/12 01:42:32 UTC
[madlib-site] 01/02: Image loader: Add support for loading ARRAY labels

This is an automated email from the ASF dual-hosted git repository.

khannaekta pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/madlib-site.git

commit a786c3dca039c39a57aca467a28a51901fc32c9c
Author: Ekta Khanna <ek...@pivotal.io>
AuthorDate: Tue May 5 13:38:51 2020 -0700

    Image loader: Add support for loading ARRAY labels
    
    Prior to this commit, the image_loader assumed that the labels can only
    be passed in as single values, instead of arrays. This does not hold
    true for certain networks.
---
 .../Deep-learning/madlib_image_loader.py           | 64 +++++++++++++++++-----
 1 file changed, 49 insertions(+), 15 deletions(-)

diff --git a/community-artifacts/Deep-learning/madlib_image_loader.py b/community-artifacts/Deep-learning/madlib_image_loader.py
index 1dc45b3..f214c52 100755
--- a/community-artifacts/Deep-learning/madlib_image_loader.py
+++ b/community-artifacts/Deep-learning/madlib_image_loader.py
@@ -6,10 +6,11 @@
 # The format of the image tables created will have at least 3 columns:
 #     (id SERIAL, x REAL[], y).  Each row is 1 image,
 #     with image data represented by x (a 3D array of type "real"), and
-#     y (category) as text.  id is just a unique identifier for each image,
-#     so they don't get mixed up during prediction.  If images are being
-#     loaded from disk, there will be an additional img_name column containing
-#     the filename of the image, to help identify later.
+#     y (category) as text or 3D array of numeric type (int[], real[], etc.).
+#     id is just a unique identifier for each image, so they don't get
+#     mixed up during prediction.  If images are being loaded from disk,
+#     there will be an additional img_name column containing the filename
+#     of the image, to help identify later.
 #
 #   ImageLoader.ROWS_PER_FILE = 1000 by default; this is the number of rows per
 #      temporary file (or StringIO buffer) loaded at once.
@@ -54,7 +55,7 @@
 #     2a. Perform parallel image loading from numpy arrays:
 #
 #           iloader.load_dataset_from_np(data_x, data_y, table_name,
-#                                        append=False)
+#                                        append=False, label_datatype='TEXT')
 #
 #       data_x contains image data in np.array format, and data_y is a 1D np.array
 #           of the image categories (labels).
@@ -73,12 +74,15 @@
 #           name instead.  This avoids needing to pass the table_name again every
 #           time, but also allows it to be changed at any time.
 #
+#       label_datatype is used for defining the datatype for y(label) in the output
+#           table, where y is a numeric array. Default datatype for y is TEXT
+#
 #  or,
 #
 #     2b. Perform parallel image loading from disk:
 #
 #           load_dataset_from_disk(self, root_dir, table_name, num_labels='all',
-#               append=False):
+#               append=False, label_datatype='TEXT'):
 #
 #       Calling this function instead will look in root_dir on the local disk of
 #           wherever this is being run.  It will skip over any files in that
@@ -94,6 +98,9 @@
 #           have hundreds of labels, but only wish to use a subset of that
 #           containing a few dozen.
 #
+#       label_datatype is used for defining the datatype for y(label) in the output
+#           table, where y is a numeric array. Default datatype for y is TEXT
+#
 #
 # If you want to load an image dataset from disk, but don't feel like writing
 #  any python code to call the API, you can just run this file directly, passing
@@ -119,6 +126,10 @@
 #                         (default: madlib)
 #   -a, --append          Name of database where images should be loaded
 #                         (default: False)
+#   -l LABEL_DATATYPE, --label-datatype LABEL_DATATYPE
+#                         SQL datatype of label column in output table for
+#                         numeric arrays
+#                         Example: INT, REAL, BIGINT (default: TEXT)
 #   -w NUM_WORKERS, --num-workers NUM_WORKERS
 #                         Name of parallel workers. (default: 5)
 #   -p PORT, --port PORT  database server port (default: 5432)
@@ -322,9 +333,13 @@ class ImageLoader:
         for i, row in enumerate(data):
             if len(row) == 3:
                 x, y, image_name = row
+                if not self.from_disk and y.ndim > 1:
+                    y = f(y)
                 yield '{0}|{1}|{2}\n'.format(f(x), y, image_name)
             elif len(row) == 2:
                 x, y = row
+                if not self.from_disk and y.ndim > 1:
+                    y = f(y)
                 yield '{0}|{1}\n'.format(f(x), y)
             else:
                 raise RuntimeError("Cannot write invalid row to table:\n{0}"\
@@ -407,14 +422,15 @@ class ImageLoader:
             print "Appending to table {0} in {1} db".format(self.table_name,
                                                             self.db_creds.db_name)
         else:
+            y_type = self.label_datatype
             # Create new table
             try:
                 if self.from_disk:
-                    sql = "CREATE TABLE {0} (id SERIAL, x REAL[], y TEXT,\
-                        img_name TEXT)".format(self.table_name)
+                    sql = "CREATE TABLE {0} (id SERIAL, x REAL[], y {1},\
+                        img_name TEXT)".format(self.table_name, y_type)
                 else:
-                    sql = "CREATE TABLE {0} (id SERIAL, x REAL[], y TEXT)"\
-                        .format( self.table_name)
+                    sql = "CREATE TABLE {0} (id SERIAL, x REAL[], y {1})"\
+                        .format( self.table_name, y_type)
                 self.db_exec(sql)
             except db.DatabaseError as e:
                 raise RuntimeError("{0} while creating {1} in db {2}.\n"
@@ -429,7 +445,7 @@ class ImageLoader:
         self.db_close()
 
     def load_dataset_from_np(self, data_x, data_y, table_name=None,
-                             append=False):
+                             append=False, label_datatype='TEXT'):
         """
         Loads a numpy array into db.  For append=False, creates a new table and
             loads the data.  For append=True, appends data to existing table.
@@ -444,11 +460,14 @@ class ImageLoader:
         @table_name Name of table in db to load data into
         @append Whether to create a new table (False) or append to an existing
             one (True).  If unspecified, default is False
+        @label_datatype: If set, will create the table with the column 'y' set
+            to the datatype specified. Default is set to TEXT
         """
         start_time = time.time()
         self.mother = True
         self.from_disk = False
         self.append = append
+        self.label_datatype = label_datatype
 
         if table_name:
             self.table_name = table_name
@@ -457,9 +476,16 @@ class ImageLoader:
             raise ValueError("Must specify table_name either in ImageLoader"
                 " constructor or in load_dataset_from_np params!")
 
+        # Flatten labels only for arrays with shape (n,1) or (1,n) since these
+        # shapes can be treated as individual labels
+        if data_y.ndim == 2 and (data_y.shape[0] == 1 or data_y.shape[1] == 1):
+            data_y = data_y.flatten()
+        else:
+            self.label_datatype = self.label_datatype + '[]'
+
+
         self._validate_input_and_create_table(data_x, data_y)
 
-        data_y = data_y.flatten()
         data = zip(data_x, data_y)
 
         if not self.pool:
@@ -531,7 +557,7 @@ class ImageLoader:
             _call_np_worker(data)
 
     def load_dataset_from_disk(self, root_dir, table_name, num_labels='all',
-                               append=False):
+                               append=False, label_datatype='TEXT'):
         """
         Load images from disk into a greenplum database table. All the images
             should be of the same shape.
@@ -545,12 +571,15 @@ class ImageLoader:
             which images will be loaded.
         @append: If set to true, do not create a new table but append to an
             existing table.
+        @label_datatype: If set, will create the table with the column 'y' set
+            to the datatype specified. Default is set to TEXT
         """
         start_time = time.time()
         self.mother = True
         self.append = append
         self.no_temp_files = False
         self.table_name = table_name
+        self.label_datatype = label_datatype
         self.from_disk = True
         self._validate_input_and_create_table()
 
@@ -619,7 +648,11 @@ def main():
 
     parser.add_argument('-a', '--append', action='store_true',
                         dest='append', default=False,
-                        help='Name of database where images should be loaded')
+                        help='Insert into existing table or Create new table')
+
+    parser.add_argument('-l', '--label-datatype', action='store',
+                        dest='label_datatype', default='TEXT',
+                        help='SQL datatype(INT, REAL, BIGINT) of label column in output table')
 
     parser.add_argument('-w', '--num-workers', action='store',
                         dest='num_workers', default=5,
@@ -660,7 +693,8 @@ def main():
     iloader.load_dataset_from_disk(args.root_dir,
                                    args.table_name,
                                    args.num_labels,
-                                   args.append)
+                                   args.append,
+                                   args.label_datatype)
 
 if __name__ == '__main__':
     main()