Posted to commits@systemml.apache.org by du...@apache.org on 2017/05/22 23:41:50 UTC

incubator-systemml git commit: [SYSTEMML-1465] Add stain normalization to breast cancer preprocessing.

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 3d1f77ce2 -> e93fdd35b


[SYSTEMML-1465] Add stain normalization to breast cancer preprocessing.

Adding stain normalization of H&E histology slides for the breast cancer
project.
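
For illustration, here is a minimal usage sketch (not part of this commit) of
the new `normalize_staining` and `flatten_sample` functions on a single sample.
It assumes `projects/breast_cancer` is on the Python path so that the module is
importable as `breastcancer.preprocessing`; the random image is only a
stand-in used to show the expected input and output shapes.

  import numpy as np
  from breastcancer.preprocessing import normalize_staining, flatten_sample

  # Hypothetical 256x256 RGB sample, as produced by `process_tile`.
  sample = np.random.randint(0, 256, size=(256, 256, 3), dtype=np.uint8)

  # Stain-normalize the sample, then flatten it into a length C*H*W row
  # vector, mirroring the two steps appended to the `preprocess` pipeline.
  slide_num, normalized = normalize_staining((1, sample))
  slide_num, flattened = flatten_sample((slide_num, normalized))
  print(normalized.shape, flattened.shape)  # (256, 256, 3) (196608,)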

Closes #507.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/e93fdd35
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/e93fdd35
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/e93fdd35

Branch: refs/heads/master
Commit: e93fdd35b82e95fd41023850939593d5d37c8512
Parents: 3d1f77c
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Mon May 22 16:39:09 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Mon May 22 16:39:09 2017 -0700

----------------------------------------------------------------------
 .../breast_cancer/breastcancer/preprocessing.py | 261 ++++++++++++++-----
 1 file changed, 200 insertions(+), 61 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e93fdd35/projects/breast_cancer/breastcancer/preprocessing.py
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/breastcancer/preprocessing.py b/projects/breast_cancer/breastcancer/preprocessing.py
index 424d699..cfedf94 100644
--- a/projects/breast_cancer/breastcancer/preprocessing.py
+++ b/projects/breast_cancer/breastcancer/preprocessing.py
@@ -47,7 +47,7 @@ from skimage.morphology import binary_closing, binary_dilation, disk
 def open_slide(slide_num, folder, training):
   """
   Open a whole-slide image, given an image number.
-  
+
   Args:
     slide_num: Slide image number as an integer.
     folder: Directory in which the slides folder is stored, as a string.
@@ -55,7 +55,7 @@ def open_slide(slide_num, folder, training):
       images in the format `TUPAC-TR-###.svs`, or a `testing_image_data`
       folder with images in the format `TUPAC-TE-###.svs`.
     training: Boolean for training or testing datasets.
-  
+
   Returns:
     An OpenSlide object representing a whole-slide image.
   """
@@ -75,15 +75,15 @@ def open_slide(slide_num, folder, training):
 def create_tile_generator(slide, tile_size, overlap):
   """
   Create a tile generator for the given slide.
-  
+
   This generator is able to extract tiles from the overall
   whole-slide image.
-  
+
   Args:
     slide: An OpenSlide object representing a whole-slide image.
     tile_size: The width and height of a square tile to be generated.
     overlap: Number of pixels by which to overlap the tiles.
-  
+
   Returns:
     A DeepZoomGenerator object representing the tile generator. Each
     extracted tile is a PIL Image with shape
@@ -100,18 +100,18 @@ def create_tile_generator(slide, tile_size, overlap):
 def get_20x_zoom_level(slide, generator):
   """
   Return the zoom level that corresponds to a 20x magnification.
-  
+
   The generator can extract tiles from multiple zoom levels,
   downsampling by a factor of 2 per level from highest to lowest
   resolution.
-  
+
   Args:
     slide: An OpenSlide object representing a whole-slide image.
     generator: A DeepZoomGenerator object representing a tile generator.
       Note: This generator is not a true "Python generator function",
       but rather is an object that is capable of extracting individual
       tiles.
-  
+
   Returns:
     Zoom level corresponding to a 20x magnification, or as close as
     possible.
@@ -138,11 +138,11 @@ def get_20x_zoom_level(slide, generator):
 def process_slide(slide_num, folder, training, tile_size, overlap):
   """
   Generate all possible tile indices for a whole-slide image.
-  
+
   Given a slide number, tile size, and overlap, generate
   all possible (slide_num, tile_size, overlap, zoom_level, col, row)
   indices.
-  
+
   Args:
     slide_num: Slide image number as an integer.
     folder: Directory in which the slides folder is stored, as a string.
@@ -152,7 +152,7 @@ def process_slide(slide_num, folder, training, tile_size, overlap):
     training: Boolean for training or testing datasets.
     tile_size: The width and height of a square tile to be generated.
     overlap: Number of pixels by which to overlap the tiles.
-  
+
   Returns:
     A list of (slide_num, tile_size, overlap, zoom_level, col, row)
     integer index tuples representing possible tiles to extract.
@@ -175,10 +175,10 @@ def process_slide(slide_num, folder, training, tile_size, overlap):
 def process_tile_index(tile_index, folder, training):
   """
   Generate a tile from a tile index.
-  
+
   Given a (slide_num, tile_size, overlap, zoom_level, col, row) tile
   index, generate a (slide_num, tile) tuple.
-  
+
   Args:
     tile_index: A (slide_num, tile_size, overlap, zoom_level, col, row)
       integer index tuple representing a tile to extract.
@@ -187,7 +187,7 @@ def process_tile_index(tile_index, folder, training):
       images in the format `TUPAC-TR-###.svs`, or a `testing_image_data`
       folder with images in the format `TUPAC-TE-###.svs`.
     training: Boolean for training or testing datasets.
-  
+
   Returns:
     A (slide_num, tile) tuple, where slide_num is an integer, and tile
     is a 3D NumPy array of shape (tile_size, tile_size, channels) in
@@ -199,7 +199,7 @@ def process_tile_index(tile_index, folder, training):
   # Create tile generator.
   generator = create_tile_generator(slide, tile_size, overlap)
   # Generate tile.
-  tile = np.array(generator.get_tile(zoom_level, (col, row)))
+  tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
   return (slide_num, tile)
 
 
@@ -208,10 +208,10 @@ def process_tile_index(tile_index, folder, training):
 def optical_density(tile):
   """
   Convert a tile to optical density values.
-  
+
   Args:
     tile: A 3D NumPy array of shape (tile_size, tile_size, channels).
-  
+
   Returns:
     A 3D NumPy array of shape (tile_size, tile_size, channels)
     representing optical density values.
@@ -225,20 +225,20 @@ def optical_density(tile):
 def keep_tile(tile_tuple, tile_size, tissue_threshold):
   """
   Determine if a tile should be kept.
-  
+
   This filters out tiles based on size and a tissue percentage
   threshold, using a custom algorithm. If a tile has height &
   width equal to (tile_size, tile_size), and contains a tissue
   percentage greater than or equal to the given threshold, then it
   will be kept; otherwise it will be filtered out.
-  
+
   Args:
     tile_tuple: A (slide_num, tile) tuple, where slide_num is an
-      integer, and tile is a 3D NumPy array of shape 
+      integer, and tile is a 3D NumPy array of shape
       (tile_size, tile_size, channels) in RGB format.
     tile_size: The width and height of a square tile to be generated.
     tissue_threshold: Tissue percentage threshold.
-  
+
   Returns:
     A Boolean indicating whether or not a tile should be kept for
     future usage.
@@ -246,7 +246,7 @@ def keep_tile(tile_tuple, tile_size, tissue_threshold):
   slide_num, tile = tile_tuple
   if tile.shape[0:2] == (tile_size, tile_size):
     tile_orig = tile
-    
+
     # Check 1
     # Convert 3D RGB image to 2D grayscale image, from
     # 0 (dense tissue) to 1 (plain background).
@@ -272,7 +272,7 @@ def keep_tile(tile_tuple, tile_size, tissue_threshold):
     # Calculate percentage of tissue coverage.
     percentage = tile.mean()
     check1 = percentage >= tissue_threshold
-    
+
     # Check 2
     # Convert to optical density values
     tile = optical_density(tile_orig)
@@ -285,39 +285,36 @@ def keep_tile(tile_tuple, tile_size, tissue_threshold):
     tile = binary_fill_holes(tile)
     percentage = tile.mean()
     check2 = percentage >= tissue_threshold
-    
+
     return check1 and check2
   else:
     return False
 
 
-# Generate Flattened Samples From Tile
+# Generate Samples From Tile
 
 def process_tile(tile_tuple, sample_size, grayscale):
   """
   Process a tile into a group of smaller samples.
-  
+
   Cut up a tile into smaller blocks of sample_size x sample_size pixels,
-  change the shape of each sample from (H, W, channels) to 
-  (channels, H, W), then flatten each into a vector of length
-  channels*H*W.
+  each retaining its (sample_size, sample_size, channels) shape.
-  
+
   Args:
     tile_tuple: A (slide_num, tile) tuple, where slide_num is an
-      integer, and tile is a 3D NumPy array of shape 
+      integer, and tile is a 3D NumPy array of shape
       (tile_size, tile_size, channels).
     sample_size: The new width and height of the square samples to be
       generated.
     grayscale: Whether or not to generate grayscale samples, rather
       than RGB.
-  
+
   Returns:
     A list of (slide_num, sample) tuples representing cut up tiles,
-    where each sample has been transposed from
-    (sample_size_x, sample_size_y, channels) to
-    (channels, sample_size_x, sample_size_y),
-    and flattened to a vector of length
-    (channels*sample_size_x*sample_size_y).
+    where each sample is a 3D NumPy array of shape
+    (sample_size_x, sample_size_y, channels).
   """
   slide_num, tile = tile_tuple
   if grayscale:
@@ -332,24 +329,166 @@ def process_tile(tile_tuple, sample_size, grayscale):
   # (num_x, num_y, sample_size_x, sample_size_y, ch).
   # 3. Combine num_x and num_y into single axis, returning
   # (num_samples, sample_size_x, sample_size_y, ch).
-  # 4. Swap axes from (num_samples, sample_size_x, sample_size_y, ch) to
-  # (num_samples, ch, sample_size_x, sample_size_y).
-  # 5. Flatten samples into (num_samples, ch*sample_size_x*sample_size_y).
   samples = (tile.reshape((x // sample_size, sample_size, y // sample_size, sample_size, ch))
                  .swapaxes(1,2)
-                 .reshape((-1, sample_size, sample_size, ch))
-                 .transpose(0,3,1,2))
-  samples = samples.reshape(samples.shape[0], -1)
+                 .reshape((-1, sample_size, sample_size, ch)))
   samples = [(slide_num, sample) for sample in list(samples)]
   return samples
 
 
+# Normalize staining
+
+def normalize_staining(sample_tuple, beta=0.15, alpha=1, light_intensity=255):
+  """
+  Normalize the staining of H&E histology slides.
+
+  The staining of the sample is normalized against reference stain
+  vectors and reference maximum stain saturations, following the
+  method of Macenko et al. (see references below).
+
+  References:
+    - Macenko, Marc, et al. "A method for normalizing histology slides
+      for quantitative analysis." 2009 IEEE International Symposium on
+      Biomedical Imaging: From Nano to Macro (ISBI'09). IEEE, 2009.
+      - http://wwwx.cs.unc.edu/~mn/sites/default/files/macenko2009.pdf
+    - https://github.com/mitkovetta/staining-normalization
+
+  Args:
+    sample_tuple: A (slide_num, sample) tuple, where slide_num is an
+      integer, and sample is a 3D NumPy array of shape (H,W,C).
+
+  Returns:
+    A (slide_num, sample) tuple, where the sample is a 3D NumPy array
+    of shape (H,W,C) that has been stain normalized.
+  """
+  # Setup.
+  slide_num, sample = sample_tuple
+  x = np.asarray(sample)
+  h, w, c = x.shape
+  x = x.reshape(-1, c).astype(np.float64)  # shape (H*W, C)
+
+  # Reference stain vectors and stain saturations.  We will normalize all slides
+  # to these references.  To create these, grab the stain vectors and stain
+  # saturations from a desirable slide.
+
+  # Values in reference implementation for use with eigendecomposition approach, natural log,
+  # and `light_intensity=240`.
+  #stain_ref = np.array([0.5626, 0.2159, 0.7201, 0.8012, 0.4062, 0.5581]).reshape(3,2)
+  #max_sat_ref = np.array([1.9705, 1.0308]).reshape(2,1)
+
+  # SVD w/ log10, and `light_intensity=255`.
+  stain_ref = (np.array([0.54598845, 0.322116, 0.72385198, 0.76419107, 0.42182333, 0.55879629])
+                 .reshape(3,2))
+  max_sat_ref = np.array([0.82791151, 0.61137274]).reshape(2,1)
+
+  # Convert RGB to OD.
+  # Note: The original paper used log10, and the reference implementation used the natural log.
+  #OD = -np.log((x+1)/light_intensity)  # shape (H*W, C)
+  OD = -np.log10(x/light_intensity + 1e-8)
+
+  # Remove data with OD intensity less than beta.
+  # I.e. remove transparent pixels.
+  # Note: This needs to be checked per channel, rather than
+  # taking an average over all channels for a given pixel.
+  OD_thresh = OD[np.all(OD >= beta, 1), :]  # shape (K, C)
+
+  # Calculate eigenvectors.
+  # Note: We can either use eigenvector decomposition, or SVD.
+  #eigvals, eigvecs = np.linalg.eig(np.cov(OD_thresh.T))  # np.cov results in inf/nans
+  U, s, V = np.linalg.svd(OD_thresh, full_matrices=False)
+
+  # Extract two largest eigenvectors.
+  # Note: We swap the sign of the eigvecs here to be consistent
+  # with other implementations.  Both +/- eigvecs are valid, with
+  # the same eigenvalue, so this is okay.
+  #top_eigvecs = eigvecs[:, np.argsort(eigvals)[-2:]] * -1
+  top_eigvecs = V[0:2, :].T * -1  # shape (C, 2)
+
+  # Project thresholded optical density values onto plane spanned by
+  # 2 largest eigenvectors.
+  proj = np.dot(OD_thresh, top_eigvecs)  # shape (K, 2)
+
+  # Calculate angle of each point wrt the first plane direction.
+  # Note: the parameters are `np.arctan2(y, x)`
+  angles = np.arctan2(proj[:, 1], proj[:, 0])  # shape (K,)
+
+  # Find robust extremes (alpha and 100-alpha percentiles) of the angle.
+  min_angle = np.percentile(angles, alpha)
+  max_angle = np.percentile(angles, 100-alpha)
+
+  # Convert min/max vectors (extremes) back to optimal stains in OD space.
+  # This computes a set of axes for each angle onto which we can project
+  # the top eigenvectors.  This assumes that the projected values have
+  # been normalized to unit length.
+  extreme_angles = np.array(
+    [[np.cos(min_angle), np.cos(max_angle)],
+     [np.sin(min_angle), np.sin(max_angle)]]
+  )  # shape (2,2)
+  stains = np.dot(top_eigvecs, extreme_angles)  # shape (C, 2)
+
+  # Order the stain vectors with hematoxylin first and eosin second, as a heuristic.
+  if stains[0, 0] < stains[0, 1]:
+    stains[:, [0, 1]] = stains[:, [1, 0]]  # swap columns
+
+  # Calculate saturations of each stain.
+  # Note: Here, we solve
+  #    OD = VS
+  #     S = V^{-1}OD
+  # where `OD` is the matrix of optical density values of our image,
+  # `V` is the matrix of stain vectors, and `S` is the matrix of stain
+  # saturations.  Since this is an overdetermined system, we use the
+  # least squares solver, rather than a direct solve.
+  sats, _, _, _ = np.linalg.lstsq(stains, OD.T)
+
+  # Normalize stain saturations to have same pseudo-maximum based on
+  # a reference max saturation.
+  max_sat = np.percentile(sats, 99, axis=1, keepdims=True)
+  sats = sats / max_sat * max_sat_ref
+
+  # Compute optimal OD values.
+  OD_norm = np.dot(stain_ref, sats)
+
+  # Recreate image.
+  # Note: If the image is immediately converted to uint8 with `.astype(np.uint8)`, it will
+  # not return the correct values due to the initial values being outside of [0,255].
+  # To fix this, we round to the nearest integer, and then clip to [0,255], which is the
+  # same behavior as Matlab.
+  #x_norm = np.exp(OD_norm) * light_intensity  # natural log approach
+  x_norm = 10**(-OD_norm) * light_intensity - 1e-8  # log10 approach
+  x_norm = np.clip(np.round(x_norm), 0, 255).astype(np.uint8)
+  x_norm = x_norm.astype(np.uint8)
+  x_norm = x_norm.T.reshape(h,w,c)
+  return (slide_num, x_norm)
+
+
+def flatten_sample(sample_tuple):
+  """
+  Flatten a (H,W,C) sample into a (C*H*W) row vector.
+
+  Transpose each sample from (H, W, channels) to (channels, H, W), then
+  flatten each into a vector of length channels*H*W.
+
+  Args:
+    sample_tuple: A (slide_num, sample) tuple, where slide_num is an
+      integer, and sample is a 3D NumPy array of shape (H,W,C).
+
+  Returns:
+    A (slide_num, sample) tuple, where the sample has been transposed
+    from (H,W,C) to (C,H,W), and flattened to a vector of length
+    (C*H*W).
+  """
+  slide_num, sample = sample_tuple
+  # 1. Swap axes from (sample_size_x, sample_size_y, ch) to
+  # (ch, sample_size_x, sample_size_y).
+  # 2. Flatten sample into (ch*sample_size_x*sample_size_y).
+  flattened_sample = sample.transpose(2,0,1).reshape(-1)
+  return (slide_num, flattened_sample)
+
+
 # Get Ground Truth Labels
 
 def get_labels_df(folder):
   """
   Create a DataFrame with the ground truth labels for each slide.
-  
+
   Args:
     folder: Directory containing a `training_ground_truth.csv` file
       containing the ground truth "tumor_score" and "molecular_score"
@@ -369,17 +508,18 @@ def get_labels_df(folder):
 # Process All Slides Into A Spark DataFrame
 
 def preprocess(spark, slide_nums, folder="data", training=True, tile_size=1024, overlap=0,
-               tissue_threshold=0.9, sample_size=256, grayscale=False, num_partitions=20000):
+               tissue_threshold=0.9, sample_size=256, grayscale=False, normalize_stains=True,
+               num_partitions=20000):
   """
   Preprocess a set of whole-slide images.
-  
+
   Preprocess a set of whole-slide images as follows:
     1. Tile the slides into tiles of size (tile_size, tile_size, 3).
     2. Filter the tiles to remove unnecessary tissue.
     3. Cut the remaining tiles into samples of size
        (sample_size, sample_size, ch), where `ch` is 1 if `grayscale`
        is true, or 3 otherwise.
+    4. Optionally normalize the staining of each sample.
+    5. Flatten each sample into a row vector of length
+       (ch * sample_size * sample_size).
-  
+
   Args:
     spark: SparkSession.
     slide_nums: List of whole-slide numbers to process.
@@ -399,8 +539,9 @@ def preprocess(spark, slide_nums, folder="data", training=True, tile_size=1024,
       generated.
     grayscale: Whether or not to generate grayscale samples, rather
       than RGB.
+    normalize_stains: Whether or not to apply stain normalization.
     num_partitions: Number of partitions to use during processing.
-  
+
   Returns:
     A Spark DataFrame in which each row contains the slide number, tumor
     score, molecular score, and the sample stretched out into a Vector.
@@ -418,17 +559,16 @@ def preprocess(spark, slide_nums, folder="data", training=True, tile_size=1024,
   #row_mb = tile_size * tile_size * channels * 8 / 1024 / 1024  # size of one row in MB
   #rows_per_part = round(part_size / row_mb)
   #num_parts = rows / rows_per_part
-  ## HACK: Force even partitioning by collecting and parallelizing -- for memory issues.
-  ## Note: This was a PySpark bug with a fix in the master branch now.
-  #tile_indices = tile_indices.collect()
-  #tile_indices = sc.parallelize(tile_indices, num_partitions)
-  ## END HACK
   tile_indices = tile_indices.repartition(num_partitions)
   tile_indices.cache()
-  # Extract all tiles into a DataFrame, filter, and cut into smaller samples.
+  # Extract all tiles into a DataFrame, filter, cut into smaller samples, apply stain
+  # normalization, and flatten.
   tiles = tile_indices.map(lambda tile_index: process_tile_index(tile_index, folder, training))
   filtered_tiles = tiles.filter(lambda tile: keep_tile(tile, tile_size, tissue_threshold))
   samples = filtered_tiles.flatMap(lambda tile: process_tile(tile, sample_size, grayscale))
+  if normalize_stains:
+    samples = samples.map(lambda sample: normalize_staining(sample))
+  samples = samples.map(lambda sample: flatten_sample(sample))
   if training:
     # Append labels
     labels_df = get_labels_df(folder)
@@ -441,7 +581,6 @@ def preprocess(spark, slide_nums, folder="data", training=True, tile_size=1024,
   else:  # testing data -- no labels
     df = samples.toDF(["slide_num", "sample"])
     df = df.select(df.slide_num.astype("int"), df["sample"])
-  #df = df.repartition(num_partitions)  # HACK: Even out the partitions to avoid saving issues
   return df
 
 
@@ -451,7 +590,7 @@ def train_val_split(spark, df, slide_nums, folder, train_frac=0.8, add_row_indic
                     debug=False):
   """
   Split a DataFrame of slide samples into training and validation sets.
-  
+
   Args:
     spark: SparkSession.
     df: A Spark DataFrame in which each row contains the slide number,
@@ -466,7 +605,7 @@ def train_val_split(spark, df, slide_nums, folder, train_frac=0.8, add_row_indic
     add_row_indices: Boolean for whether or not to prepend an index
       column containing the row index for use downstream by SystemML.
       The column name will be "__INDEX".
-    
+
   Returns:
     A Spark DataFrame in which each row contains the slide number, tumor
     score, molecular score, and the sample stretched out into a Vector.
@@ -474,7 +613,7 @@ def train_val_split(spark, df, slide_nums, folder, train_frac=0.8, add_row_indic
   # Create DataFrame of labels for the given slide numbers.
   labels_df = get_labels_df(folder)
   labels_df = labels_df.loc[slide_nums]
-    
+
   # Randomly split slides 80%/20% into train and validation sets.
   train_nums_df = labels_df.sample(frac=train_frac, random_state=seed)
   val_nums_df = labels_df.drop(train_nums_df.index)
@@ -487,11 +626,11 @@ def train_val_split(spark, df, slide_nums, folder, train_frac=0.8, add_row_indic
                    .coalesce(1))
 
   # Note: Explicitly mark the smaller DataFrames as able to be broadcasted
-  # in order to have Catalyst choose the more efficient BroadcastHashJoin, 
+  # in order to have Catalyst choose the more efficient BroadcastHashJoin,
   # rather than the costly SortMergeJoin.
   train = df.join(F.broadcast(train_nums), on="slide_num")
   val = df.join(F.broadcast(val_nums), on="slide_num")
-  
+
   if debug:
     # DEBUG: Sanity checks.
     assert len(pd.merge(train_nums_df, val_nums_df, on="slide_num")) == 0
@@ -506,14 +645,14 @@ def train_val_split(spark, df, slide_nums, folder, train_frac=0.8, add_row_indic
     print(train.count(), val.count())
     #  - Check physical plans for broadcast join.
     print(train.explain(), val.explain())
-  
+
   # Add row indices for use with SystemML.
   if add_row_indices:
     train = (train.rdd
                   .zipWithIndex()
                   .map(lambda r: (r[1] + 1, *r[0]))  # flatten & convert index to 1-based indexing
                   .toDF(['__INDEX', 'slide_num', 'tumor_score', 'molecular_score', 'sample']))
-    train = train.select(train["__INDEX"].astype("int"), train.slide_num.astype("int"), 
+    train = train.select(train["__INDEX"].astype("int"), train.slide_num.astype("int"),
                          train.tumor_score.astype("int"), train.molecular_score, train["sample"])
 
     val = (val.rdd
@@ -531,7 +670,7 @@ def train_val_split(spark, df, slide_nums, folder, train_frac=0.8, add_row_indic
 def save(df, filepath, sample_size, grayscale, mode="error", format="parquet", file_size=128):
   """
   Save a preprocessed DataFrame with a constraint on the file sizes.
-  
+
   Args:
     df: A Spark DataFrame.
     filepath: Hadoop-supported path at which to save `df`.