You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2019/03/25 18:24:15 UTC
[GitHub] [incubator-mxnet] nswamy commented on a change in pull request #14503: API to create RecordIO files

nswamy commented on a change in pull request #14503: API to create RecordIO files
URL: https://github.com/apache/incubator-mxnet/pull/14503#discussion_r268789368
 
 

 ##########
 File path: python/mxnet/io/io.py
 ##########
 @@ -966,6 +985,165 @@ def creator(*args, **kwargs):
     creator.__doc__ = doc_str
     return creator
 
+
+def _read_list(list_file, batch_size):
+    """
+    Helper function that reads the .lst file and yields its
+    parsed records in batches.
+
+    Each line is split on tabs and read as
+    ``index<TAB>label(s)...<TAB>path``. Lines with fewer than three
+    fields, or whose index/labels are not numeric, are logged and skipped.
+
+    Parameters
+    ----------
+    list_file: path of the input .lst file.
+    batch_size: maximum number of records per yielded batch.
+
+    Yields
+    ------
+    list of records, each of the form ``[index, path, label, ...]``
+    (``int``, ``str``, then one ``float`` per label). The last batch
+    may hold fewer than ``batch_size`` records.
+    """
+    def get_generator():
+        with open(list_file) as fin:
+            # Iterating the file object reads line by line until EOF.
+            for raw_line in fin:
+                line = [i.strip() for i in raw_line.strip().split('\t')]
+                line_len = len(line)
+                # check the data format of .lst file
+                if line_len < 3:
+                    logging.info("lst should have at least has three parts, "
+                                 "but only has %s parts for %s", line_len, line)
+                    continue
+                try:
+                    item = [int(line[0]), line[-1]] + [float(i) for i in line[1:-1]]
+                except ValueError as e:
+                    # int()/float() reject non-numeric text; skip the record
+                    logging.info('Parsing lst met error for %s, detail: %s', line, e)
+                    continue
+                yield item
+
+    data_iter = get_generator()
+    while True:
+        # islice pulls at most batch_size records without reading ahead
+        data_batch = list(itertools.islice(data_iter, batch_size))
+        if not data_batch:
+            return
+        yield data_batch
+
+def _read_worker(q_out, transforms, color, quality, encoding, data_record, root=''):
+    """
+    Helper function that will be run by the read workers
+    to load one image, apply the transformations, pack it
+    into a record and put the result on the output queue.
+    Failures are logged and the record is skipped.
+    Parameters
+    ----------
+    q_out: queue that receives ``(i, packed_record, item)`` tuples
+    transforms: callable applied to the image returned by cv2.imread
+    color: flag passed to cv2.imread
+    quality: ``quality`` argument passed to recordio.pack_img
+    encoding: ``img_fmt`` argument passed to recordio.pack_img
+    data_record: ``(i, item)`` pair, where ``item[0]`` is the record id,
+        ``item[1]`` the image path and ``item[2:]`` the label value(s)
+    root: optional directory joined in front of ``item[1]``; the default
+        uses ``item[1]`` as given
+    """
+    i, item = data_record
+    # The settings come from the parameters; no `args` object exists here.
+    fullpath = os.path.join(root, item[1])
+    try:
+        # construct the header of the record
+        if len(item) > 3:
+            header = recordio.IRHeader(0, item[2:], item[0], 0)
+        else:
+            header = recordio.IRHeader(0, item[2], item[0], 0)
+
+        img = cv2.imread(fullpath, color)
+        if img is None:
+            logging.info('imread read blank (None) image for file: %s', fullpath)
+            return
+        img = transforms(img)
+        s = recordio.pack_img(header, img, quality=quality, img_fmt=encoding)
+        q_out.put((i, s, item))
+    except Exception as e:
+        # Best-effort worker: report the failing file and carry on.
+        logging.info('pack_img error on file: %s, detail: %s', fullpath, e)
+
+def _validate_filenames(list_file, output_path):
+    """
+    Helper function to validate the file paths of
+    the input list file and output .rec file path.
+    Parameters
+    --------
+    list_file: input list file path
+    output_path: path to the output directory
+    Raises
+    --------
+    ValueError: if list_file is not an existing file or
+        output_path is not an existing directory
+    """
+    # Adjacent string literals keep source indentation out of the messages.
+    if not os.path.isfile(list_file):
+        raise ValueError("Input list file is invalid - "
+                         "1. Wrong filename or file path \n"
+                         "2. List file should be of format *.lst")
+    if not os.path.isdir(output_path):
+        raise ValueError("Output path should be a directory where the "
+                         "rec files will be stored.")
+
+def _count_elem(iter):
+    """
+    Helper function that exhausts a generator and reports
+    how many elements it produced.
+    Parameters
+    -----
+    iter: generator object; it is fully consumed by this call
+    Returns
+    -----
+    count: total count of elements
+    """
+    # Only the tally is kept; the elements themselves are discarded.
+    return sum(1 for _ in iter)
+
+def im2rec(list_file, transforms, dataset_params, output_path):
 
 Review comment:
   `dataset_params` -> I think we should expand this

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services