Posted to commits@beam.apache.org by da...@apache.org on 2022/09/29 15:27:23 UTC

[beam] branch master updated: Example of Online Clustering (#23289)

This is an automated email from the ASF dual-hosted git repository.

damccorm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git


The following commit(s) were added to refs/heads/master by this push:
     new 7b8aa28e34a Example of Online Clustering  (#23289)
7b8aa28e34a is described below

commit 7b8aa28e34a70a178bd569b28c3e0fcf9d87dd6b
Author: Shubham Krishna <sh...@gmail.com>
AuthorDate: Thu Sep 29 17:27:10 2022 +0200

    Example of Online Clustering  (#23289)
    
    * Add Online Clustering Code and first draft of documentation
    
    * Update Documentation and correct code snippets
    
    * fix formatting, improve documentation
    
    * Fix formatting and linting
    
    * Fix isort errors
    
    * Fix formatting using yapf and add new lines
    
    * Remove new lines
    
    * Remove new line
    
    * Fix import order errors
    
    * Remove trailing newline
    
    Co-authored-by: Shubham Krishna <“shubham.krishna@ml6.eu”>
---
 .../clustering_pipeline/__init__.py                |  16 ++
 .../clustering_pipeline/config.py                  |  28 +++
 .../online_clustering/clustering_pipeline/main.py  | 122 ++++++++++++
 .../clustering_pipeline/pipeline/__init__.py       |  16 ++
 .../clustering_pipeline/pipeline/options.py        |  63 ++++++
 .../pipeline/transformations.py                    | 197 ++++++++++++++++++
 .../online_clustering/clustering_pipeline/setup.py |  44 +++++
 .../write_data_to_pubsub_pipeline/__init__.py      |  16 ++
 .../write_data_to_pubsub_pipeline/config.py        |  23 +++
 .../write_data_to_pubsub_pipeline/main.py          |  89 +++++++++
 .../pipeline/__init__.py                           |  16 ++
 .../pipeline/options.py                            |  62 ++++++
 .../pipeline/utils.py                              |  67 +++++++
 .../write_data_to_pubsub_pipeline/setup.py         |  42 ++++
 .../en/documentation/ml/online-clustering.md       | 219 +++++++++++++++++++++
 .../partials/section-menu/en/documentation.html    |   6 +
 16 files changed, 1026 insertions(+)

diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/__init__.py b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/__init__.py
new file mode 100644
index 00000000000..cce3acad34a
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/__init__.py
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/config.py b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/config.py
new file mode 100644
index 00000000000..a528d9a90ff
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/config.py
@@ -0,0 +1,28 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""The file defines global variables."""
+
+PROJECT_ID = "apache-beam-testing"
+# Subscription for PubSub Topic
+SUBSCRIPTION_ID = f"projects/{PROJECT_ID}/subscriptions/newsgroup-dataset-subscription"
+JOB_NAME = "online-clustering-birch"
+NUM_WORKERS = 1
+MODEL_NAME = "sentence-transformers-stsb-distilbert-base"
+TOKENIZER_NAME = "sentence-transformers/stsb-distilbert-base"
+MODEL_STATE_DICT_PATH = f"gs://{PROJECT_ID}-ml-examples/{MODEL_NAME}/pytorch_model.bin"
+MODEL_CONFIG_PATH = TOKENIZER_NAME
diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/main.py b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/main.py
new file mode 100644
index 00000000000..350eec7ed87
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/main.py
@@ -0,0 +1,122 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""This file contains the pipeline for doing online clustering."""
+import argparse
+import sys
+
+import apache_beam as beam
+import config as cfg
+from apache_beam.io.gcp.pubsub import ReadFromPubSub
+from apache_beam.ml.inference.base import KeyedModelHandler
+from apache_beam.ml.inference.base import RunInference
+from apache_beam.ml.inference.pytorch_inference import PytorchModelHandlerKeyedTensor
+from pipeline.options import get_pipeline_options
+from pipeline.transformations import Decode
+from pipeline.transformations import GetUpdates
+from pipeline.transformations import ModelWrapper
+from pipeline.transformations import NormalizeEmbedding
+from pipeline.transformations import StatefulOnlineClustering
+from pipeline.transformations import tokenize_sentence
+from transformers import AutoConfig
+
+
+def parse_arguments(argv):
+  """
+    It parses the arguments passed to the command line and returns them as an object
+
+    Args:
+      argv: The arguments passed to the command line.
+
+    Returns:
+      The arguments that are being passed in.
+    """
+  parser = argparse.ArgumentParser(description="online-clustering")
+
+  parser.add_argument(
+      "-m",
+      "--mode",
+      help="Mode to run pipeline in.",
+      choices=["local", "cloud"],
+      default="local",
+  )
+  parser.add_argument(
+      "-p",
+      "--project",
+      help="GCP project to run pipeline on.",
+      default=cfg.PROJECT_ID,
+  )
+
+  args, _ = parser.parse_known_args(args=argv)
+  return args
+
+
+# Can be removed once: https://github.com/apache/beam/issues/21863 is fixed
+class PytorchNoBatchModelHandler(PytorchModelHandlerKeyedTensor):
+  """Wrapper to PytorchModelHandler to limit batch size to 1.
+    The tokenized strings generated from BertTokenizer may have different
+    lengths, which doesn't work with torch.stack() in current RunInference
+    implementation since stack() requires tensors to be the same size.
+    Restricting max_batch_size to 1 means there is only 1 example per `batch`
+    in the run_inference() call.
+    """
+  def batch_elements_kwargs(self):
+    return {"max_batch_size": 1}
+
+
+def run():
+  """
+    It runs the pipeline. It reads from PubSub, decodes the message,
+    tokenizes the text, gets the embedding, normalizes the embedding,
+    maps the document to a key, and then performs stateful clustering using Birch.
+    """
+  args = parse_arguments(sys.argv)
+  pipeline_options = get_pipeline_options(
+      job_name=cfg.JOB_NAME,
+      num_workers=cfg.NUM_WORKERS,
+      project=args.project,
+      mode=args.mode,
+  )
+
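+  # The model handler loads the DistilBERT state dict from GCS and runs inference
+  # on CPU, one example per batch (see PytorchNoBatchModelHandler above).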
+  model_handler = PytorchNoBatchModelHandler(
+      state_dict_path=cfg.MODEL_STATE_DICT_PATH,
+      model_class=ModelWrapper,
+      model_params={
+          "config": AutoConfig.from_pretrained(cfg.MODEL_CONFIG_PATH)
+      },
+      device="cpu",
+  )
+
+  with beam.Pipeline(options=pipeline_options) as pipeline:
+    docs = (
+        pipeline | "Read from PubSub" >> ReadFromPubSub(
+            subscription=cfg.SUBSCRIPTION_ID, with_attributes=True)
+        | "Decode PubSubMessage" >> beam.ParDo(Decode()))
+    normalized_embedding = (
+        docs | "Tokenize Text" >> beam.Map(tokenize_sentence)
+        | "Get Embedding" >> RunInference(KeyedModelHandler(model_handler))
+        | "Normalize Embedding" >> beam.ParDo(NormalizeEmbedding()))
+    clustering = (
+        normalized_embedding | "Map doc to key" >> beam.Map(lambda x: (1, x))
+        | "StatefulClustering using Birch" >> beam.ParDo(
+            StatefulOnlineClustering()))
+
+    _ = clustering | "Format Update" >> beam.ParDo(GetUpdates())
+
+
+if __name__ == "__main__":
+  run()
diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/pipeline/__init__.py b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/pipeline/__init__.py
new file mode 100644
index 00000000000..cce3acad34a
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/pipeline/__init__.py
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/pipeline/options.py b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/pipeline/options.py
new file mode 100644
index 00000000000..128c87e0872
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/pipeline/options.py
@@ -0,0 +1,63 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""This file contains the pipeline options to configure the Dataflow pipeline."""
+from datetime import datetime
+
+import config as cfg
+from apache_beam.options.pipeline_options import PipelineOptions
+
+
+def get_pipeline_options(
+    project: str,
+    job_name: str,
+    mode: str,
+    num_workers: int = cfg.NUM_WORKERS,
+    streaming: bool = True,
+) -> PipelineOptions:
+  """Function to retrieve the pipeline options.
+    Args:
+        project: GCP project to run on
+        job_name: Name of the Dataflow job
+        mode: Indicator to run local, cloud or template
+        num_workers: Number of workers running the job in parallel
+        streaming: Whether the pipeline runs in streaming mode
+    Returns:
+        Dataflow pipeline options
+    """
+  job_name = f'{job_name}-{datetime.now().strftime("%Y%m%d%H%M%S")}'
+
+  staging_bucket = f"gs://{cfg.PROJECT_ID}-ml-examples"
+
+  # For a list of available options, check:
+  # https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-dataflow-pipeline-options
+  dataflow_options = {
+      "runner": "DirectRunner" if mode == "local" else "DataflowRunner",
+      "job_name": job_name,
+      "project": project,
+      "region": "us-central1",
+      "staging_location": f"{staging_bucket}/dflow-staging",
+      "temp_location": f"{staging_bucket}/dflow-temp",
+      # "save_main_session": False,
+      "setup_file": "./setup.py",
+      "streaming": streaming,
+  }
+
+  # Optional parameters
+  if num_workers:
+    dataflow_options.update({"num_workers": num_workers})
+
+  return PipelineOptions(flags=[], **dataflow_options)
diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/pipeline/transformations.py b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/pipeline/transformations.py
new file mode 100644
index 00000000000..26010516f0c
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/pipeline/transformations.py
@@ -0,0 +1,197 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""This file contains the transformations and utility functions for
+the online_clustering pipeline."""
+from collections import Counter
+from collections import defaultdict
+
+import numpy as np
+from sklearn.cluster import Birch
+
+import apache_beam as beam
+import config as cfg
+import torch
+from apache_beam.coders import PickleCoder
+from apache_beam.transforms.userstate import ReadModifyWriteStateSpec
+from transformers import AutoTokenizer
+from transformers import DistilBertModel
+
+Tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_NAME)
+
+
+def tokenize_sentence(input_dict):
+  """
+    It takes a dictionary with a text and an id, tokenizes the text, and
+    returns a tuple of the text and id and the tokenized text
+
+    Args:
+      input_dict: a dictionary with the text and id of the sentence
+
+    Returns:
+      A tuple of the text and id, and a dictionary of the tokens.
+    """
+  text, uid = input_dict["text"], input_dict["id"]
+  tokens = Tokenizer([text], padding=True, truncation=True, return_tensors="pt")
+  tokens = {key: torch.squeeze(val) for key, val in tokens.items()}
+  return (text, uid), tokens
+
+
+class ModelWrapper(DistilBertModel):
+  """Wrapper to DistilBertModel to get embeddings when calling
+    forward function."""
+  def forward(self, **kwargs):
+    output = super().forward(**kwargs)
+    sentence_embedding = (
+        self.mean_pooling(output,
+                          kwargs["attention_mask"]).detach().cpu().numpy())
+    return sentence_embedding
+
+  # Mean Pooling - Take attention mask into account for correct averaging
+  def mean_pooling(self, model_output, attention_mask):
+    """
+        The function calculates the mean of token embeddings
+
+        Args:
+          model_output: The output of the model.
+          attention_mask: This is a tensor that contains 1s for all input tokens and
+          0s for all padding tokens.
+
+        Returns:
+          The mean of the token embeddings.
+        """
+    token_embeddings = model_output[
+        0]  # First element of model_output contains all token embeddings
+    input_mask_expanded = (
+        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float())
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+        input_mask_expanded.sum(1), min=1e-9)
+
+
+class NormalizeEmbedding(beam.DoFn):
+  """A DoFn for normalization of text embedding."""
+  def process(self, element, *args, **kwargs):
+    """
+        For each element in the input PCollection, normalize the embedding vector, and
+        yield a new element with the normalized embedding added
+        Args:
+          element: The element to be processed.
+        """
+    (text, uid), prediction = element
+    embedding = prediction.inference
+    l2_norm = np.linalg.norm(embedding)
+    yield {"text": text, "id": uid, "embedding": embedding / l2_norm}
+
+
+class Decode(beam.DoFn):
+  """A DoFn for decoding PubSub message into a dictionary."""
+  def process(self, element, *args, **kwargs):
+    """
+        For each element in the input PCollection, retrieve the id and decode the bytes into string
+
+        Args:
+          element: The element that is being processed.
+        """
+    yield {
+        "text": element.data.decode("utf-8"),
+        "id": element.attributes["id"],
+    }
+
+
+class StatefulOnlineClustering(beam.DoFn):
+  """A DoFn for online clustering on vector embeddings."""
+
+  BIRCH_MODEL_SPEC = ReadModifyWriteStateSpec("clustering_model", PickleCoder())
+  DATA_ITEMS_SPEC = ReadModifyWriteStateSpec("data_items", PickleCoder())
+  EMBEDDINGS_SPEC = ReadModifyWriteStateSpec("embeddings", PickleCoder())
+  UPDATE_COUNTER_SPEC = ReadModifyWriteStateSpec(
+      "update_counter", PickleCoder())
+
+  # [START stateful_clustering]
+  def process(
+      self,
+      element,
+      model_state=beam.DoFn.StateParam(BIRCH_MODEL_SPEC),
+      collected_docs_state=beam.DoFn.StateParam(DATA_ITEMS_SPEC),
+      collected_embeddings_state=beam.DoFn.StateParam(EMBEDDINGS_SPEC),
+      update_counter_state=beam.DoFn.StateParam(UPDATE_COUNTER_SPEC),
+      *args,
+      **kwargs,
+  ):
+    """
+        Takes the embedding of a document and updates the clustering model
+
+        Args:
+          element: The input element to be processed.
+          model_state: This is the state of the clustering model. It is a stateful parameter,
+          which means that it will be updated after each call to the process function.
+          collected_docs_state: This is a stateful dictionary that stores the documents that
+          have been processed so far.
+          collected_embeddings_state: This is a dictionary of document IDs and their embeddings.
+          update_counter_state: This is a counter that keeps track of how many documents have been
+        processed.
+        """
+    # 1. Initialise or load states
+    clustering = model_state.read() or Birch(n_clusters=None, threshold=0.7)
+    collected_documents = collected_docs_state.read() or {}
+    collected_embeddings = collected_embeddings_state.read() or {}
+    update_counter = update_counter_state.read() or Counter()
+
+    # 2. Extract document, add to state, and add to clustering model
+    _, doc = element
+    doc_id = doc["id"]
+    embedding_vector = doc["embedding"]
+    collected_embeddings[doc_id] = embedding_vector
+    collected_documents[doc_id] = {"id": doc_id, "text": doc["text"]}
+    update_counter = len(collected_documents)
+
+    clustering.partial_fit(np.atleast_2d(embedding_vector))
+
+    # 3. Predict cluster labels of collected documents
+    cluster_labels = clustering.predict(
+        np.array(list(collected_embeddings.values())))
+
+    # 4. Write states
+    model_state.write(clustering)
+    collected_docs_state.write(collected_documents)
+    collected_embeddings_state.write(collected_embeddings)
+    update_counter_state.write(update_counter)
+    yield {
+        "labels": cluster_labels,
+        "docs": collected_documents,
+        "id": list(collected_embeddings.keys()),
+        "counter": update_counter,
+    }
+    # [END stateful_clustering]
+
+
+class GetUpdates(beam.DoFn):
+  """A DoFn for printing the clusters and items belonging to each cluster."""
+  def process(self, element, *args, **kwargs):
+    """
+        Prints and returns clusters with items contained in it
+        """
+    cluster_labels = element.get("labels")
+    doc_ids = element.get("id")
+    docs = element.get("docs")
+    print(f"Update Number: {element.get('counter')}:::\n")
+    label_items_map = defaultdict(list)
+    for doc_id, cluster_label in zip(doc_ids, cluster_labels):
+      label_items_map[cluster_label].append(docs[doc_id])
+    print(label_items_map)
+    print("\n\n\n\n")
+    yield label_items_map
diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/setup.py b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/setup.py
new file mode 100644
index 00000000000..51983469a11
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/setup.py
@@ -0,0 +1,44 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Setup.py module for the workflow's worker utilities.
+All the workflow related code is gathered in a package that will be built as a
+source distribution, staged in the staging area for the workflow being run and
+then installed in the workers when they start running.
+This behavior is triggered by specifying the --setup_file command line option
+when running the workflow for remote execution.
+"""
+
+import setuptools
+from setuptools import find_packages
+
+REQUIREMENTS = [
+    "apache-beam[gcp]==2.40.0",
+    "transformers==4.21.1",
+    "torch==1.12.1",
+    "scikit-learn==1.0.2",
+]
+
+setuptools.setup(
+    name="catalog-dataflow-pipeline",
+    version="1.1.1",
+    install_requires=REQUIREMENTS,
+    packages=find_packages(),
+    author="Apache Software Foundation",
+    author_email="dev@beam.apache.org",
+    py_modules=["config"],
+)
diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/__init__.py b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/__init__.py
new file mode 100644
index 00000000000..cce3acad34a
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/__init__.py
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/config.py b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/config.py
new file mode 100644
index 00000000000..93e0b03bd1b
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/config.py
@@ -0,0 +1,23 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""The file defines global variables."""
+PROJECT_ID = "apache-beam-testing"
+# PubSub Topic ID
+TOPIC_ID = f"projects/{PROJECT_ID}/topics/newsgroup-dataset"
+JOB_NAME = "write-to-pub-sub"
+NUM_WORKERS = 1
diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/main.py b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/main.py
new file mode 100644
index 00000000000..19782e67751
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/main.py
@@ -0,0 +1,89 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""This file contains the pipeline for writing twitter messages to PubSub."""
+import argparse
+import sys
+
+import apache_beam as beam
+import config as cfg
+from apache_beam.io.gcp.pubsub import WriteToPubSub
+from pipeline.options import get_pipeline_options
+from pipeline.utils import AssignUniqueID
+from pipeline.utils import ConvertToPubSubMessage
+from pipeline.utils import get_dataset
+
+
+def parse_arguments(argv):
+  """
+    It parses the arguments passed to the command line and returns them as an object
+
+    Args:
+      argv: The arguments passed to the command line.
+
+    Returns:
+      The arguments that are being passed in.
+    """
+  parser = argparse.ArgumentParser(description="write-to-pubsub")
+
+  parser.add_argument(
+      "-m",
+      "--mode",
+      help="Mode to run pipeline in.",
+      choices=["local", "cloud"],
+      default="local",
+  )
+  parser.add_argument(
+      "-p",
+      "--project",
+      help="GCP project to run pipeline on.",
+      default=cfg.PROJECT_ID,
+  )
+
+  args, _ = parser.parse_known_args(args=argv)
+  return args
+
+
+def run():
+  """
+    It runs the pipeline. It loads the training data,
+    assigns a unique ID to each document, converts it to a PubSub message, and
+    writes it to PubSub.
+    """
+  args = parse_arguments(sys.argv)
+  pipeline_options = get_pipeline_options(
+      job_name=cfg.JOB_NAME,
+      num_workers=cfg.NUM_WORKERS,
+      project=args.project,
+      mode=args.mode,
+  )
+  train_categories = ["joy", "love", "fear"]
+  train_data, _ = get_dataset(train_categories)
+
+  with beam.Pipeline(options=pipeline_options) as pipeline:
+    docs = (
+        pipeline | "Load Documents" >> beam.Create(train_data)
+        | "Assign unique key" >> beam.ParDo(AssignUniqueID()))
+    _ = (
+        docs
+        | "Convert to PubSub Message" >> beam.ParDo(ConvertToPubSubMessage())
+        | "Write to PubSub" >> WriteToPubSub(
+            topic=cfg.TOPIC_ID, with_attributes=True))
+
+
+if __name__ == "__main__":
+  run()
diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/pipeline/__init__.py b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/pipeline/__init__.py
new file mode 100644
index 00000000000..cce3acad34a
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/pipeline/__init__.py
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/pipeline/options.py b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/pipeline/options.py
new file mode 100644
index 00000000000..bd5004322e6
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/pipeline/options.py
@@ -0,0 +1,62 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""This file contains the pipeline options to configure the Dataflow pipeline."""
+from datetime import datetime
+
+import config as cfg
+from apache_beam.options.pipeline_options import PipelineOptions
+
+
+def get_pipeline_options(
+    project: str,
+    job_name: str,
+    mode: str,
+    num_workers: int = cfg.NUM_WORKERS,
+    streaming: bool = True,
+) -> PipelineOptions:
+  """Function to retrieve the pipeline options.
+    Args:
+        project: GCP project to run on
+        job_name: Name of the Dataflow job
+        mode: Indicator to run local, cloud or template
+        num_workers: Number of workers running the job in parallel
+        streaming: Whether the pipeline runs in streaming mode
+    Returns:
+        Dataflow pipeline options
+    """
+  job_name = f'{job_name}-{datetime.now().strftime("%Y%m%d%H%M%S")}'
+
+  staging_bucket = f"gs://{cfg.PROJECT_ID}-ml-examples"
+
+  # For a list of available options, check:
+  # https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-dataflow-pipeline-options
+  dataflow_options = {
+      "runner": "DirectRunner" if mode == "local" else "DataflowRunner",
+      "job_name": job_name,
+      "project": project,
+      "region": "us-central1",
+      "staging_location": f"{staging_bucket}/dflow-staging",
+      "temp_location": f"{staging_bucket}/dflow-temp",
+      "setup_file": "./setup.py",
+      "streaming": streaming,
+  }
+
+  # Optional parameters
+  if num_workers:
+    dataflow_options.update({"num_workers": num_workers})
+
+  return PipelineOptions(flags=[], **dataflow_options)
diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/pipeline/utils.py b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/pipeline/utils.py
new file mode 100644
index 00000000000..c0007de63d2
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/pipeline/utils.py
@@ -0,0 +1,67 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""This file contains the transformations and utility functions for
+the pipeline."""
+import uuid
+
+import numpy as np
+
+import apache_beam as beam
+from apache_beam.io.gcp.pubsub import PubsubMessage
+from datasets import load_dataset
+
+
+def get_dataset(categories: list, split: str = "train"):
+  """
+    It takes a list of categories and a split (train/validation/test) and returns the
+    corresponding subset of the dataset
+
+    Args:
+      categories (list): list of emotion categories to use
+      split (str): The split of the dataset to use. Can be either "train", "validation", or "test".
+      Defaults to train
+
+    Returns:
+      A list of text and a list of labels
+    """
+  labels = ["sadness", "joy", "love", "anger", "fear", "surprise"]
+  label_map = {
+      class_name: class_id
+      for class_id, class_name in enumerate(labels)
+  }
+  labels_subset = np.array([label_map[class_name] for class_name in categories])
+  emotion_dataset = load_dataset("emotion", download_mode="force_redownload")
+  X, y = np.array(emotion_dataset[split]["text"]), np.array(
+      emotion_dataset[split]["label"])
+  subclass_idxs = [idx for idx, label in enumerate(y) if label in labels_subset]
+  X_subset, y_subset = X[subclass_idxs], y[subclass_idxs]
+  return X_subset.tolist(), y_subset.tolist()
+
+
+class AssignUniqueID(beam.DoFn):
+  """A DoFn for assigning Unique ID to each text."""
+  def process(self, element, *args, **kwargs):
+    uid = str(uuid.uuid4())
+    yield {"id": uid, "text": element}
+
+
+class ConvertToPubSubMessage(beam.DoFn):
+  """A DoFn for converting into PubSub message format."""
+  def process(self, element, *args, **kwargs):
+    yield PubsubMessage(
+        data=element["text"].encode("utf-8"), attributes={"id": element["id"]})
diff --git a/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/setup.py b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/setup.py
new file mode 100644
index 00000000000..c1da9ab454d
--- /dev/null
+++ b/sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/setup.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Setup.py module for the workflow's worker utilities.
+All the workflow related code is gathered in a package that will be built as a
+source distribution, staged in the staging area for the workflow being run and
+then installed in the workers when they start running.
+This behavior is triggered by specifying the --setup_file command line option
+when running the workflow for remote execution.
+"""
+
+import setuptools
+from setuptools import find_packages
+
+REQUIREMENTS = [
+    "apache-beam[gcp]==2.40.0",
+    "datasets==2.4.0",
+]
+
+setuptools.setup(
+    name="write-to-pubsub-pipeline",
+    version="1.1.1",
+    install_requires=REQUIREMENTS,
+    packages=find_packages(),
+    author="Apache Software Foundation",
+    author_email="dev@beam.apache.org",
+    py_modules=["config"],
+)
diff --git a/website/www/site/content/en/documentation/ml/online-clustering.md b/website/www/site/content/en/documentation/ml/online-clustering.md
new file mode 100644
index 00000000000..98eeec2bcd6
--- /dev/null
+++ b/website/www/site/content/en/documentation/ml/online-clustering.md
@@ -0,0 +1,219 @@
+---
+title: "Online Clustering"
+---
+<!--
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# OnlineClustering Example
+
+The OnlineClustering example demonstrates how to set up a real-time clustering pipeline that reads text from PubSub, converts the text into an embedding using a language model, and clusters it using BIRCH.
+
+### Dataset for Clustering
+For this example, we use a dataset called [emotion](https://huggingface.co/datasets/emotion). It comprises 20,000 English Twitter messages labeled with 6 basic emotions: anger, fear, joy, love, sadness, and surprise. The dataset has three splits: train, validation, and test. It is a supervised dataset because each message is labeled with its emotion category (class). The dataset can easily be accessed using [HuggingFace Datasets](https://huggingface.co/docs/datasets/index).
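+
+As a quick, optional sanity check (not part of the Beam pipelines), the dataset can be loaded and inspected directly with HuggingFace Datasets:
+
+{{< highlight >}}
+from datasets import load_dataset
+
+# Download all three splits (train/validation/test) of the emotion dataset.
+emotion_dataset = load_dataset("emotion")
+
+# Each record holds the raw text and an integer class label.
+sample = emotion_dataset["train"][0]
+print(sample["text"], sample["label"])
+print(emotion_dataset["train"].features["label"].names)  # class names, e.g. ['sadness', 'joy', ...]
+{{< /highlight >}}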
+
+To get a better sense of the dataset, here are some examples from its train split:
+
+
+| Text        | Type of emotion |
+| :---        |    :----:   |
+| im grabbing a minute to post i feel greedy wrong      | Anger       |
+| i am ever feeling nostalgic about the fireplace i will know that it is still on the property   | Love        |
+| ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny | Fear |
+| on a boat trip to denmark | Joy |
+| i feel you know basically like a fake in the realm of science fiction | Sadness |
+| i began having them several times a week feeling tortured by the hallucinations moving people and figures sounds and vibrations | Fear |
+
+### Clustering Algorithm
+To cluster the tweets, we use an incremental clustering algorithm called BIRCH (balanced iterative reducing and clustering using hierarchies). It is an unsupervised data mining algorithm used to perform hierarchical clustering over particularly large datasets. An advantage of BIRCH is its ability to incrementally and dynamically cluster incoming, multi-dimensional metric data points in an attempt to produce the best quality clustering for a given set of resources (memory and time constraints).
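+
+To get a feel for how BIRCH builds clusters incrementally, here is a minimal, self-contained sketch (independent of the Beam pipeline) that updates scikit-learn's `Birch` model in small batches, the same way the streaming pipeline later calls `partial_fit` on each new embedding:
+
+{{< highlight >}}
+import numpy as np
+from sklearn.cluster import Birch
+
+# n_clusters=None keeps the raw BIRCH subclusters instead of running a final global clustering step.
+birch = Birch(n_clusters=None, threshold=0.7)
+
+# Simulate a stream by updating the model one small batch of 768-dimensional vectors at a time.
+for batch in np.array_split(np.random.rand(100, 768), 10):
+    birch.partial_fit(batch)
+
+# Assign any vectors (new or previously seen) to clusters.
+print(birch.predict(np.random.rand(5, 768)))
+{{< /highlight >}}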
+
+
+## Ingestion to PubSub
+We first ingest the data into [PubSub](https://cloud.google.com/pubsub/docs/overview) so that while clustering we can read the tweets from PubSub. PubSub is a messaging service for exchanging event data among applications and services. It is used for streaming analytics and data integration pipelines to ingest and distribute data.
+
+The full example code for ingesting data to PubSub can be found [here](sdks/python/apache_beam/examples/inference/online_clustering/write_data_to_pubsub_pipeline/).
+
+The file structure for the ingestion pipeline is:
+
+    write_data_to_pubsub_pipeline/
+    ├── pipeline/
+    │   ├── __init__.py
+    │   ├── options.py
+    │   └── utils.py
+    ├── __init__.py
+    ├── config.py
+    ├── main.py
+    └── setup.py
+
+`pipeline/utils.py` contains the code for loading the emotion dataset and the two `beam.DoFn`s used for data transformation.
+
+`pipeline/options.py` contains the pipeline options used to configure the Dataflow pipeline.
+
+`config.py` defines variables, such as the GCP PROJECT_ID and NUM_WORKERS, that are used multiple times.
+
+`setup.py` defines the packages/requirements for the pipeline to run.
+
+`main.py` contains the pipeline code and additional functions used for running the pipeline.
+
+### How to Run the Pipeline?
+First, make sure you have installed the required packages.
+
+1. Locally on your machine: `python main.py`
+2. On GCP for Dataflow: `python main.py --mode cloud`
+
+
+The `write_data_to_pubsub_pipeline` contains four different transforms:
+1. Load the emotion dataset using HuggingFace Datasets (for simplicity, we take samples from 3 classes instead of 6)
+2. Associate each text with a unique identifier (UID)
+3. Convert the text into the format PubSub expects (see the sketch after this list)
+4. Write the formatted message to PubSub
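+
+A condensed sketch of the conversion in step 3, based on the `ConvertToPubSubMessage` DoFn in `pipeline/utils.py`:
+
+{{< highlight >}}
+import apache_beam as beam
+from apache_beam.io.gcp.pubsub import PubsubMessage
+
+class ConvertToPubSubMessage(beam.DoFn):
+    """Wraps each {"id": ..., "text": ...} dictionary into a PubsubMessage."""
+    def process(self, element, *args, **kwargs):
+        # The text becomes the message payload; the UID travels along as an attribute.
+        yield PubsubMessage(
+            data=element["text"].encode("utf-8"),
+            attributes={"id": element["id"]})
+{{< /highlight >}}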
+
+
+## Clustering on Streaming Data
+
+With the data ingested into PubSub, we can now look at the second pipeline, which reads the streaming messages from PubSub, converts each text into an embedding using a language model, and clusters the texts using BIRCH.
+
+The full example code for all the steps mentioned above can be found [here](sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/).
+
+
+The file structure for `clustering_pipeline` is:
+
+    clustering_pipeline/
+    ├── pipeline/
+    │   ├── __init__.py
+    │   ├── options.py
+    │   └── transformations.py
+    ├── __init__.py
+    ├── config.py
+    ├── main.py
+    └── setup.py
+
+`pipeline/transformations.py` contains the code for the different `beam.DoFn`s used in the pipeline.
+
+`pipeline/options.py` contains the pipeline options used to configure the Dataflow pipeline.
+
+`config.py` defines variables, such as the GCP PROJECT_ID and NUM_WORKERS, that are used multiple times.
+
+`setup.py` defines the packages/requirements for the pipeline to run.
+
+`main.py` contains the pipeline code and additional functions used for running the pipeline.
+
+### How to Run the Pipeline?
+First, make sure you have installed the required packages and you have pushed data to PubSub.
+
+1. Locally on your machine: `python main.py`
+2. On GCP for Dataflow: `python main.py --mode cloud`
+
+The pipeline can be broken down into a few simple steps:
+
+1. Reading the message from PubSub
+2. Converting the PubSub message into a `PCollection` of dictionaries, where the key is the UID and the value is the Twitter text
+3. Encoding the text into transformer-readable token ID integers using a tokenizer
+4. Using RunInference to get the vector embedding from a Transformer-based language model
+5. Normalizing the embedding for clustering
+6. Performing BIRCH Clustering using Stateful Processing
+7. Printing the texts assigned to clusters
+
+The code snippet for the first two steps of the pipeline, where the message is read from PubSub and decoded into a dictionary:
+
+{{< highlight >}}
+    docs = (
+        pipeline
+        | "Read from PubSub"
+        >> ReadFromPubSub(subscription=cfg.SUBSCRIPTION_ID, with_attributes=True)
+        | "Decode PubSubMessage" >> beam.ParDo(Decode())
+    )
+{{< /highlight >}}
+
+
+We now take a closer look at the three most important steps of the pipeline: tokenizing the text, feeding the tokenized text to a Transformer-based language model to get an embedding, and performing clustering using [Stateful Processing](https://beam.apache.org/blog/stateful-processing/).
+
+
+### Getting Embedding from a Language Model
+
+To cluster text data, we first need to map the text into vectors of numerical values suitable for statistical analysis. We use a transformer-based language model called [sentence-transformers/stsb-distilbert-base](https://huggingface.co/sentence-transformers/stsb-distilbert-base). It maps sentences and paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search. But before the model can produce embeddings, we first need to tokenize the text.
+
+Tokenization is a preprocessing step: it transforms the text so that it can be fed into the model to get predictions.
+
+{{< highlight >}}
+    normalized_embedding = (
+        docs
+        | "Tokenize Text" >> beam.Map(tokenize_sentence)
+{{< /highlight >}}
+
+Here, `tokenize_sentence` is a function that takes a dictionary with a text and an id, tokenizes the text, and returns a tuple (text, id) and the tokenized output.
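+
+A condensed version of `tokenize_sentence` (the full function lives in `pipeline/transformations.py`) looks roughly like this:
+
+{{< highlight >}}
+import torch
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/stsb-distilbert-base")
+
+def tokenize_sentence(input_dict):
+    text, uid = input_dict["text"], input_dict["id"]
+    tokens = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
+    # Drop the batch dimension so each value is a plain tensor, keyed by (text, id).
+    tokens = {key: torch.squeeze(val) for key, val in tokens.items()}
+    return (text, uid), tokens
+{{< /highlight >}}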
+
+
+The tokenized output is then passed to the language model to produce the embeddings. To get the embeddings from the language model, we use Beam's `RunInference()` transform.
+
+{{< highlight >}}
+    | "Get Embedding" >> RunInference(KeyedModelHandler(model_handler))
+
+{{< /highlight >}}
+
+After the embedding is computed for each Twitter text, it is normalized, because normalized embeddings tend to produce better clusters.
+
+{{< highlight >}}
+    | "Normalize Embedding" >> beam.ParDo(NormalizeEmbedding())
+
+{{< /highlight >}}
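+
+Under the hood, `NormalizeEmbedding` simply divides each embedding by its L2 norm. A condensed sketch of the `DoFn` from `pipeline/transformations.py`:
+
+{{< highlight >}}
+import apache_beam as beam
+import numpy as np
+
+class NormalizeEmbedding(beam.DoFn):
+    def process(self, element, *args, **kwargs):
+        (text, uid), prediction = element
+        embedding = prediction.inference
+        # Scale to unit length so distances reflect direction rather than magnitude.
+        l2_norm = np.linalg.norm(embedding)
+        yield {"text": text, "id": uid, "embedding": embedding / l2_norm}
+{{< /highlight >}}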
+
+
+### StatefulOnlineClustering
+Because the data arrives in a streaming fashion, we need an iterative clustering algorithm like BIRCH to cluster it. And because the algorithm is iterative, we also need a mechanism to store the previous state so that it can be updated whenever a new Twitter text arrives. **Stateful Processing** enables a `DoFn` to have persistent state, which can be read and written during the processing of each element. For more detail, see the [Stateful Processing](https://beam.apache.org/blog/stateful-processing/) blog post linked above.
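+
+As a minimal illustration of the mechanism (separate from the clustering example), a stateful `DoFn` that counts how many elements it has seen per key could look like this:
+
+{{< highlight >}}
+import apache_beam as beam
+from apache_beam.coders import PickleCoder
+from apache_beam.transforms.userstate import ReadModifyWriteStateSpec
+
+class CountPerKey(beam.DoFn):
+    COUNT_SPEC = ReadModifyWriteStateSpec("count", PickleCoder())
+
+    def process(self, element, count_state=beam.DoFn.StateParam(COUNT_SPEC)):
+        # Read the persisted count for this key, update it, and write it back.
+        count = (count_state.read() or 0) + 1
+        count_state.write(count)
+        key, value = element
+        yield key, value, count
+{{< /highlight >}}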
+
+In the clustering example, every time a new message is read from PubSub, we retrieve the existing state of the clustering model, update it, and write it back to the state.
+
+{{< highlight >}}
+    clustering = (
+        normalized_embedding
+        | "Map doc to key" >> beam.Map(lambda x: (1, x))
+        | "StatefulClustering using Birch" >> beam.ParDo(StatefulOnlineClustering())
+    )
+{{< /highlight >}}
+
+Because BIRCH doesn't support parallelization, we need to make sure that all the stateful processing is done by a single worker. To achieve that, we use `beam.Map` to map every text to the same key, `1`.
+
+`StatefulOnlineClustering` is a `DoFn` that takes the embedding of a text and updates the clustering model. To store the state, it uses `ReadModifyWriteStateSpec` state objects, which act as containers for storage.
+
+{{< highlight >}}
+
+class StatefulOnlineClustering(beam.DoFn):
+
+    BIRCH_MODEL_SPEC = ReadModifyWriteStateSpec("clustering_model", PickleCoder())
+    DATA_ITEMS_SPEC = ReadModifyWriteStateSpec("data_items", PickleCoder())
+    EMBEDDINGS_SPEC = ReadModifyWriteStateSpec("embeddings", PickleCoder())
+    UPDATE_COUNTER_SPEC = ReadModifyWriteStateSpec("update_counter", PickleCoder())
+
+{{< /highlight >}}
+
+We declare four different `ReadModifyWriteStateSpec` objects:
+
+* `BIRCH_MODEL_SPEC`: holds the state of the clustering model
+* `DATA_ITEMS_SPEC`: holds the Twitter texts seen so far
+* `EMBEDDINGS_SPEC`: holds the normalized embeddings
+* `UPDATE_COUNTER_SPEC`: holds the number of texts processed
+
+
+These `ReadModifyWriteStateSpec` objects are passed as additional arguments to the `process` function. When a new item comes in, we retrieve the existing state of the different objects, update them, and then write them back as persistent shared state.
+
+{{< highlight file="sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/pipeline/transformations.py" >}}
+{{< code_sample "sdks/python/apache_beam/examples/inference/online_clustering/clustering_pipeline/pipeline/transformations.py" stateful_clustering >}}
+{{< /highlight >}}
+
+
+`GetUpdates` is a `DoFn` that prints the cluster assigned to each Twitter message every time a new message arrives.
+
+{{< highlight >}}
+updated_clusters = clustering | "Format Update" >> beam.ParDo(GetUpdates())
+{{< /highlight >}}
\ No newline at end of file
diff --git a/website/www/site/layouts/partials/section-menu/en/documentation.html b/website/www/site/layouts/partials/section-menu/en/documentation.html
index e6d1d5c742f..6a6ff67e032 100644
--- a/website/www/site/layouts/partials/section-menu/en/documentation.html
+++ b/website/www/site/layouts/partials/section-menu/en/documentation.html
@@ -209,6 +209,12 @@
     <li><a href="/documentation/patterns/grouping-elements-for-efficient-external-service-calls/">Grouping elements for efficient external service calls</a></li>
   </ul>
 </li>
+<li class="section-nav-item--collapsible">
+  <span class="section-nav-list-title">AI/ML pipelines</span>
+  <ul class="section-nav-list">
+    <li><a href="/documentation/ml/online-clustering/">Online Clustering</a></li>
+  </ul>
+</li>
 <li class="section-nav-item--collapsible">
   <span class="section-nav-list-title">Runtime systems</span>