You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by kh...@apache.org on 2020/05/12 01:42:32 UTC
[madlib-site] 01/02: Image loader: Add support for loading ARRAY
labels
This is an automated email from the ASF dual-hosted git repository.
khannaekta pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/madlib-site.git
commit a786c3dca039c39a57aca467a28a51901fc32c9c
Author: Ekta Khanna <ek...@pivotal.io>
AuthorDate: Tue May 5 13:38:51 2020 -0700
Image loader: Add support for loading ARRAY labels
Prior to this commit, the image_loader assumed that the labels can only
be passed in as single values, instead of arrays. This does not hold
true for certain networks.
---
.../Deep-learning/madlib_image_loader.py | 64 +++++++++++++++++-----
1 file changed, 49 insertions(+), 15 deletions(-)
diff --git a/community-artifacts/Deep-learning/madlib_image_loader.py b/community-artifacts/Deep-learning/madlib_image_loader.py
index 1dc45b3..f214c52 100755
--- a/community-artifacts/Deep-learning/madlib_image_loader.py
+++ b/community-artifacts/Deep-learning/madlib_image_loader.py
@@ -6,10 +6,11 @@
# The format of the image tables created will have at least 3 columns:
# (id SERIAL, x REAL[], y). Each row is 1 image,
# with image data represented by x (a 3D array of type "real"), and
-# y (category) as text. id is just a unique identifier for each image,
-# so they don't get mixed up during prediction. If images are being
-# loaded from disk, there will be an additional img_name column containing
-# the filename of the image, to help identify later.
+# y (category) as text or a 3D array of numeric type (int[], real[], etc.).
+# id is just a unique identifier for each image, so they don't get
+# mixed up during prediction. If images are being loaded from disk,
+# there will be an additional img_name column containing the filename
+# of the image, to help identify later.
#
# ImageLoader.ROWS_PER_FILE = 1000 by default; this is the number of rows per
# temporary file (or StringIO buffer) loaded at once.
@@ -54,7 +55,7 @@
# 2a. Perform parallel image loading from numpy arrays:
#
# iloader.load_dataset_from_np(data_x, data_y, table_name,
-# append=False)
+# append=False, label_datatype='TEXT')
#
# data_x contains image data in np.array format, and data_y is a 1D np.array
# of the image categories (labels).
@@ -73,12 +74,15 @@
# name instead. This avoids needing to pass the table_name again every
# time, but also allows it to be changed at any time.
#
+# label_datatype is used for defining the datatype for y (label) in the output
+# table, where y is a numeric array. Default datatype for y is TEXT
+#
# or,
#
# 2b. Perform parallel image loading from disk:
#
# load_dataset_from_disk(self, root_dir, table_name, num_labels='all',
-# append=False):
+# append=False, label_datatype='TEXT'):
#
# Calling this function instead will look in root_dir on the local disk of
# wherever this is being run. It will skip over any files in that
@@ -94,6 +98,9 @@
# have hundreds of labels, but only wish to use a subset of that
# containing a few dozen.
#
+# label_datatype is used for defining the datatype for y (label) in the output
+# table, where y is a numeric array. Default datatype for y is TEXT
+#
#
# If you want to load an image dataset from disk, but don't feel like writing
# any python code to call the API, you can just run this file directly, passing
@@ -119,6 +126,10 @@
# (default: madlib)
# -a, --append Append to existing table instead of creating a new one
# (default: False)
+# -l LABEL_DATATYPE, --label-datatype LABEL_DATATYPE
+# SQL datatype of label column in output table for
+# numeric arrays
+# Example: INT, REAL, BIGINT (default: TEXT)
# -w NUM_WORKERS, --num-workers NUM_WORKERS
# Name of parallel workers. (default: 5)
# -p PORT, --port PORT database server port (default: 5432)
@@ -322,9 +333,13 @@ class ImageLoader:
for i, row in enumerate(data):
if len(row) == 3:
x, y, image_name = row
+ if not self.from_disk and y.ndim > 1:
+ y = f(y)
yield '{0}|{1}|{2}\n'.format(f(x), y, image_name)
elif len(row) == 2:
x, y = row
+ if not self.from_disk and y.ndim > 1:
+ y = f(y)
yield '{0}|{1}\n'.format(f(x), y)
else:
raise RuntimeError("Cannot write invalid row to table:\n{0}"\
@@ -407,14 +422,15 @@ class ImageLoader:
print "Appending to table {0} in {1} db".format(self.table_name,
self.db_creds.db_name)
else:
+ y_type = self.label_datatype
# Create new table
try:
if self.from_disk:
- sql = "CREATE TABLE {0} (id SERIAL, x REAL[], y TEXT,\
- img_name TEXT)".format(self.table_name)
+ sql = "CREATE TABLE {0} (id SERIAL, x REAL[], y {1},\
+ img_name TEXT)".format(self.table_name, y_type)
else:
- sql = "CREATE TABLE {0} (id SERIAL, x REAL[], y TEXT)"\
- .format( self.table_name)
+ sql = "CREATE TABLE {0} (id SERIAL, x REAL[], y {1})"\
+ .format( self.table_name, y_type)
self.db_exec(sql)
except db.DatabaseError as e:
raise RuntimeError("{0} while creating {1} in db {2}.\n"
@@ -429,7 +445,7 @@ class ImageLoader:
self.db_close()
def load_dataset_from_np(self, data_x, data_y, table_name=None,
- append=False):
+ append=False, label_datatype='TEXT'):
"""
Loads a numpy array into db. For append=False, creates a new table and
loads the data. For append=True, appends data to existing table.
@@ -444,11 +460,14 @@ class ImageLoader:
@table_name Name of table in db to load data into
@append Whether to create a new table (False) or append to an existing
one (True). If unspecified, default is False
+ @label_datatype: If set, will create the table with the column 'y' set
+ to the datatype specified. Default is set to TEXT
"""
start_time = time.time()
self.mother = True
self.from_disk = False
self.append = append
+ self.label_datatype = label_datatype
if table_name:
self.table_name = table_name
@@ -457,9 +476,16 @@ class ImageLoader:
raise ValueError("Must specify table_name either in ImageLoader"
" constructor or in load_dataset_from_np params!")
+ # Flatten labels only for arrays with shape (n,1) or (1,n) since these
+ # shapes can be treated as individual labels
+ if data_y.ndim == 2 and (data_y.shape[0] == 1 or data_y.shape[1] == 1):
+ data_y = data_y.flatten()
+ else:
+ self.label_datatype = self.label_datatype + '[]'
+
+
self._validate_input_and_create_table(data_x, data_y)
- data_y = data_y.flatten()
data = zip(data_x, data_y)
if not self.pool:
@@ -531,7 +557,7 @@ class ImageLoader:
_call_np_worker(data)
def load_dataset_from_disk(self, root_dir, table_name, num_labels='all',
- append=False):
+ append=False, label_datatype='TEXT'):
"""
Load images from disk into a greenplum database table. All the images
should be of the same shape.
@@ -545,12 +571,15 @@ class ImageLoader:
which images will be loaded.
@append: If set to true, do not create a new table but append to an
existing table.
+ @label_datatype: If set, will create the table with the column 'y' set
+ to the datatype specified. Default is set to TEXT
"""
start_time = time.time()
self.mother = True
self.append = append
self.no_temp_files = False
self.table_name = table_name
+ self.label_datatype = label_datatype
self.from_disk = True
self._validate_input_and_create_table()
@@ -619,7 +648,11 @@ def main():
parser.add_argument('-a', '--append', action='store_true',
dest='append', default=False,
- help='Name of database where images should be loaded')
+ help='Insert into existing table or Create new table')
+
+ parser.add_argument('-l', '--label-datatype', action='store',
+ dest='label_datatype', default='TEXT',
+ help='SQL datatype(INT, REAL, BIGINT) of label column in output table')
parser.add_argument('-w', '--num-workers', action='store',
dest='num_workers', default=5,
@@ -660,7 +693,8 @@ def main():
iloader.load_dataset_from_disk(args.root_dir,
args.table_name,
args.num_labels,
- args.append)
+ args.append,
+ args.label_datatype)
if __name__ == '__main__':
main()