Posted to commits@beam.apache.org by da...@apache.org on 2017/04/06 14:33:20 UTC

[1/7] beam git commit: HadoopInputFormatIO with junits

Repository: beam
Updated Branches:
  refs/heads/master 9c284d625 -> 82694fe72


http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/LargeProductionCluster/es-services.yaml
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/LargeProductionCluster/es-services.yaml b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/LargeProductionCluster/es-services.yaml
new file mode 100644
index 0000000..38c820e
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/LargeProductionCluster/es-services.yaml
@@ -0,0 +1,277 @@
+#    Licensed to the Apache Software Foundation (ASF) under one or more
+#    contributor license agreements.  See the NOTICE file distributed with
+#    this work for additional information regarding copyright ownership.
+#    The ASF licenses this file to You under the Apache License, Version 2.0
+#    (the "License"); you may not use this file except in compliance with
+#    the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Service file containing the Elasticsearch discovery service, the frontend service, and the master, client, and data node deployments.
+
+# Kubernetes headless service for Elasticsearch discovery of nodes.
+apiVersion: v1
+kind: Service
+metadata:
+  name: elasticsearch-discovery
+  labels:
+    component: elasticsearch
+    role: master
+spec:
+  selector:
+    component: elasticsearch
+    role: master
+  ports:
+  - name: transport
+    port: 9300
+    protocol: TCP
+---
+# Kubernetes service for the Elasticsearch frontend cluster.
+# It sets up a load balancer on TCP port 9200 that distributes network traffic to the ES client nodes.
+apiVersion: v1
+kind: Service
+metadata:
+  name: elasticsearch
+  labels:
+    component: elasticsearch
+    role: client
+spec:
+  type: LoadBalancer
+  selector:
+    component: elasticsearch
+    role: client
+  ports:
+  - name: http
+    port: 9200
+    protocol: TCP
+---
+# The Kubernetes deployment script for Elasticsearch master nodes.
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: es-master
+  labels:
+    component: elasticsearch
+    role: master
+spec:
+  replicas: 3
+  template:
+    metadata:
+      labels:
+        component: elasticsearch
+        role: master
+      annotations:
+        pod.beta.kubernetes.io/init-containers: '[
+          {
+          "name": "sysctl",
+            "image": "busybox",
+            "imagePullPolicy": "IfNotPresent",
+            "command": ["sysctl", "-w", "vm.max_map_count=262144"],
+            "securityContext": {
+              "privileged": true
+            }
+          }
+        ]'
+    spec:
+      containers:
+      - name: es-master
+        securityContext:
+          privileged: false
+          capabilities:
+            add:
+# IPC_LOCK capability is enabled to allow Elasticsearch to lock the heap in memory so it will not be swapped.
+              - IPC_LOCK
+# SYS_RESOURCE is a Docker capability key used to control and override resource limits.
+# This may be needed to increase base limits (e.g. the file descriptor limit for Elasticsearch).
+              - SYS_RESOURCE
+        image: quay.io/pires/docker-elasticsearch-kubernetes:5.2.2
+        env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: "CLUSTER_NAME"
+          value: "myesdb"
+        - name: "NUMBER_OF_MASTERS"
+          value: "2"
+        - name: NODE_MASTER
+          value: "true"
+        - name: NODE_INGEST
+          value: "false"
+        - name: NODE_DATA
+          value: "false"
+        - name: HTTP_ENABLE
+          value: "false"
+        - name: "ES_JAVA_OPTS"
+          value: "-Xms256m -Xmx256m"
+        ports:
+        - containerPort: 9300
+          name: transport
+          protocol: TCP
+        volumeMounts:
+        - name: storage
+          mountPath: /data
+      volumes:
+          - emptyDir:
+              medium: ""
+            name: "storage"
+---
+# Kubernetes deployment script for Elasticsearch client nodes (aka load balancing proxies).
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: es-client
+  labels:
+    component: elasticsearch
+    role: client
+spec:
+  # The number of replicas can be increased based on client HTTP API usage.
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        component: elasticsearch
+        role: client
+      annotations:
+      # Elasticsearch uses a hybrid mmapfs / niofs directory by default to store its indices.
+      # The default operating system limit on mmap counts is likely to be too low, which may result
+      # in out-of-memory exceptions. Therefore, the init container defined in the pod initialization
+      # annotation below raises vm.max_map_count to handle large amounts of data.
+        pod.beta.kubernetes.io/init-containers: '[
+          {
+          "name": "sysctl",
+            "image": "busybox",
+            "imagePullPolicy": "IfNotPresent",
+            "command": ["sysctl", "-w", "vm.max_map_count=262144"],
+            "securityContext": {
+              "privileged": true
+            }
+          }
+        ]'
+    spec:
+      containers:
+      - name: es-client
+        securityContext:
+          privileged: false
+          capabilities:
+            add:
+# IPC_LOCK capability is enabled to allow Elasticsearch to lock the heap in memory so it will not be swapped.
+              - IPC_LOCK
+# SYS_RESOURCE is a Docker capability key used to control and override resource limits.
+# This may be needed to increase base limits (e.g. the file descriptor limit for Elasticsearch).
+              - SYS_RESOURCE
+        image: quay.io/pires/docker-elasticsearch-kubernetes:5.2.2
+        env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: "CLUSTER_NAME"
+          value: "myesdb"
+        - name: NODE_MASTER
+          value: "false"
+        - name: NODE_DATA
+          value: "false"
+        - name: HTTP_ENABLE
+          value: "true"
+        - name: "ES_JAVA_OPTS"
+          value: "-Xms256m -Xmx256m"
+        ports:
+        - containerPort: 9200
+          name: http
+          protocol: TCP
+        - containerPort: 9300
+          name: transport
+          protocol: TCP
+        volumeMounts:
+        - name: storage
+          mountPath: /data
+      volumes:
+          - emptyDir:
+              medium: ""
+            name: "storage"
+---
+# Kubernetes deployment script for Elasticsearch data nodes which store and index data.
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: es-data
+  labels:
+    component: elasticsearch
+    role: data
+spec:
+  replicas: 2
+  template:
+    metadata:
+      labels:
+        component: elasticsearch
+        role: data
+      annotations:
+        pod.beta.kubernetes.io/init-containers: '[
+          {
+          "name": "sysctl",
+            "image": "busybox",
+            "imagePullPolicy": "IfNotPresent",
+            "command": ["sysctl", "-w", "vm.max_map_count=1048575"],
+            "securityContext": {
+              "privileged": true
+            }
+          }
+        ]'
+    spec:
+      containers:
+      - name: es-data
+        securityContext:
+          privileged: false
+          capabilities:
+            add:
+# IPC_LOCK capability is enabled to allow Elasticsearch to lock the heap in memory so it will not be swapped.
+              - IPC_LOCK
+# SYS_RESOURCE is a Docker capability key used to control and override resource limits.
+# This may be needed to increase base limits (e.g. the file descriptor limit for Elasticsearch).
+              - SYS_RESOURCE
+        image: quay.io/pires/docker-elasticsearch-kubernetes:5.2.2
+        env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: "CLUSTER_NAME"
+          value: "myesdb"
+        - name: NODE_MASTER
+          value: "false"
+        - name: NODE_INGEST
+          value: "false"
+        - name: HTTP_ENABLE
+          value: "false"
+        - name: "ES_JAVA_OPTS"
+          value: "-Xms256m -Xmx256m"
+        ports:
+        - containerPort: 9300
+          name: transport
+          protocol: TCP
+        volumeMounts:
+        - name: storage
+          mountPath: /data
+      volumes:
+          - emptyDir:
+              medium: ""
+            name: "storage"
\ No newline at end of file

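For reference, once these manifests are applied (via the start-up.sh script below), the pods and the
frontend service can be inspected using the labels defined above; a minimal sketch, assuming kubectl
already points at the target cluster:

    # List the Elasticsearch pods by role (labels come from the deployments above).
    kubectl get pods -l component=elasticsearch,role=master
    kubectl get pods -l component=elasticsearch,role=client
    kubectl get pods -l component=elasticsearch,role=data
    # Show the LoadBalancer service that exposes port 9200.
    kubectl get svc elasticsearch
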
http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/LargeProductionCluster/start-up.sh
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/LargeProductionCluster/start-up.sh b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/LargeProductionCluster/start-up.sh
new file mode 100644
index 0000000..4d277c8
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/LargeProductionCluster/start-up.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+#    Licensed to the Apache Software Foundation (ASF) under one or more
+#    contributor license agreements.  See the NOTICE file distributed with
+#    this work for additional information regarding copyright ownership.
+#    The ASF licenses this file to You under the Apache License, Version 2.0
+#    (the "License"); you may not use this file except in compliance with
+#    the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+#
+
+set -e
+
+# Create Elasticsearch services and deployments.
+kubectl create -f es-services.yaml

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/LargeProductionCluster/teardown.sh
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/LargeProductionCluster/teardown.sh b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/LargeProductionCluster/teardown.sh
new file mode 100644
index 0000000..a30793b
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/LargeProductionCluster/teardown.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#    Licensed to the Apache Software Foundation (ASF) under one or more
+#    contributor license agreements.  See the NOTICE file distributed with
+#    this work for additional information regarding copyright ownership.
+#    The ASF licenses this file to You under the Apache License, Version 2.0
+#    (the "License"); you may not use this file except in compliance with
+#    the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+set -e
+
+# Delete elasticsearch services and deployments.
+kubectl delete -f es-services.yaml
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/SmallITCluster/elasticsearch-svc-rc.yaml
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/SmallITCluster/elasticsearch-svc-rc.yaml b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/SmallITCluster/elasticsearch-svc-rc.yaml
new file mode 100644
index 0000000..9a7ac3d
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/SmallITCluster/elasticsearch-svc-rc.yaml
@@ -0,0 +1,84 @@
+#    Licensed to the Apache Software Foundation (ASF) under one or more
+#    contributor license agreements.  See the NOTICE file distributed with
+#    this work for additional information regarding copyright ownership.
+#    The ASF licenses this file to You under the Apache License, Version 2.0
+#    (the "License"); you may not use this file except in compliance with
+#    the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+# Kubernetes service for the Elasticsearch frontend cluster.
+# It sets up a load balancer on TCP port 9200 that distributes network traffic to the ES nodes.
+apiVersion: v1
+kind: Service
+metadata:
+  name: elasticsearch
+  labels:
+    component: elasticsearch
+spec:
+  type: LoadBalancer
+  selector:
+    component: elasticsearch
+  ports:
+  - name: http
+    port: 9200
+    protocol: TCP
+  - name: transport
+    port: 9300
+    protocol: TCP
+---
+# The Kubernetes deployment script for Elasticsearch nodes. It creates a single-node cluster.
+# To scale the cluster as desired, increase the replica count, e.g. with 'kubectl scale --replicas=3 deployment/es'.
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: es
+  labels:
+    component: elasticsearch
+spec:
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        component: elasticsearch
+    spec:
+      containers:
+      - name: es
+        securityContext:
+          capabilities:
+            add:
+# IPC_LOCK capability is enabled to allow Elasticsearch to lock the heap in memory so it will not be swapped.   
+              - IPC_LOCK
+# SYS_RESOURCE capability is set to control and override various resource limits.
+              - SYS_RESOURCE
+        image: quay.io/pires/docker-elasticsearch-kubernetes:5.2.2
+        env:
+        - name: "CLUSTER_NAME"
+          value: "myesdb"
+        - name: "DISCOVERY_SERVICE"
+          value: "elasticsearch"
+        - name: NODE_MASTER
+          value: "true"
+        - name: NODE_DATA
+          value: "true"
+        - name: HTTP_ENABLE
+          value: "true"
+        ports:
+        - containerPort: 9200
+          name: http
+          protocol: TCP
+        - containerPort: 9300
+          name: transport
+          protocol: TCP
+        volumeMounts:
+        - mountPath: /data
+          name: storage
+      volumes:
+      - name: storage
+        emptyDir: {}
\ No newline at end of file

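As the comment at the top of this manifest notes, the single-node cluster can be scaled after
creation; a minimal sketch, assuming the deployment was created with the name 'es' defined above:

    # Scale the Elasticsearch deployment from one to three replicas.
    kubectl scale --replicas=3 deployment/es
    # Confirm the additional pods are scheduled.
    kubectl get pods -l component=elasticsearch
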
http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/SmallITCluster/start-up.sh
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/SmallITCluster/start-up.sh b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/SmallITCluster/start-up.sh
new file mode 100644
index 0000000..e8cf275
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/SmallITCluster/start-up.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+#    Licensed to the Apache Software Foundation (ASF) under one or more
+#    contributor license agreements.  See the NOTICE file distributed with
+#    this work for additional information regarding copyright ownership.
+#    The ASF licenses this file to You under the Apache License, Version 2.0
+#    (the "License"); you may not use this file except in compliance with
+#    the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+#
+
+set -e
+
+# Create Elasticsearch services and deployments.
+kubectl create -f elasticsearch-svc-rc.yaml
+

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/SmallITCluster/teardown.sh
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/SmallITCluster/teardown.sh b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/SmallITCluster/teardown.sh
new file mode 100644
index 0000000..079141d
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/SmallITCluster/teardown.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#    Licensed to the Apache Software Foundation (ASF) under one or more
+#    contributor license agreements.  See the NOTICE file distributed with
+#    this work for additional information regarding copyright ownership.
+#    The ASF licenses this file to You under the Apache License, Version 2.0
+#    (the "License"); you may not use this file except in compliance with
+#    the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+set -e
+
+# Delete elasticsearch services and deployments.
+kubectl delete -f elasticsearch-svc-rc.yaml

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/data-load-setup.sh
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/data-load-setup.sh b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/data-load-setup.sh
new file mode 100644
index 0000000..00991bc
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/data-load-setup.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+# Install python
+sudo apt-get update
+sudo apt-get install python-pip
+sudo pip install --upgrade pip
+sudo apt-get install python-dev
+sudo pip install tornado numpy
+echo

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/data-load.sh
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/data-load.sh b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/data-load.sh
new file mode 100644
index 0000000..21150fb
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/data-load.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+# Identify external IP
+external_ip="$(kubectl get svc elasticsearch -o jsonpath='{.status.loadBalancer.ingress[0].ip}')"
+echo "Waiting for the Elasticsearch service to come up ........"
+while [ -z "$external_ip" ]
+do
+   sleep 10s
+   external_ip="$(kubectl get svc elasticsearch -o jsonpath='{.status.loadBalancer.ingress[0].ip}')"
+   echo "."
+done
+echo "External IP - $external_ip"
+echo
+
+# Run the script
+/usr/bin/python es_test_data.py --count=1000 --format=Txn_ID:int,Item_Code:int,Item_ID:int,User_Name:str,last_updated:ts,Price:int,Title:str,Description:str,Age:int,Item_Name:str,Item_Price:int,Availability:bool,Batch_Num:int,Last_Ordered:tstxt,City:text --es_url=http://$external_ip:9200 &
\ No newline at end of file

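Once the generator started by data-load.sh completes, the ingested documents can be verified against
the default index created by es_test_data.py (index_name defaults to test_data); a minimal sketch,
assuming the same external IP resolved in the script above:

    # Count the documents indexed by the data-load run.
    curl "http://$external_ip:9200/test_data/_count?pretty"
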
http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/es_test_data.py
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/es_test_data.py b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/es_test_data.py
new file mode 100644
index 0000000..1658e2c
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/es_test_data.py
@@ -0,0 +1,299 @@
+#!/usr/bin/python
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Script to populate data on Elasticsearch.
+# The hashcode for 1000 records is ed36c09b5e24a95fd8d3cc711a043a85320bb47d.
+# For the test that uses a query to select one record from 1000 docs,
+# the hashcode is 83c108ff81e87b6f3807c638e6bb9a9e3d430dc7.
+# The hashcode for 50m records (~20 GB) is aff7390ee25c4c330f0a58dfbfe335421b11e405.
+
+import json
+import time
+import logging
+import random
+import string
+import uuid
+import datetime
+
+import tornado.gen
+import tornado.httpclient
+import tornado.ioloop
+import tornado.options
+
+async_http_client = tornado.httpclient.AsyncHTTPClient()
+id_counter = 0
+upload_data_count = 0
+_dict_data = None
+
+
+
+def delete_index(idx_name):
+    try:
+        url = "%s/%s?refresh=true" % (tornado.options.options.es_url, idx_name)
+        request = tornado.httpclient.HTTPRequest(url, method="DELETE", request_timeout=240, 
+                                                 auth_username=tornado.options.options.username, 
+                                                 auth_password=tornado.options.options.password)
+        response = tornado.httpclient.HTTPClient().fetch(request)
+        logging.info('Deleting index  "%s" done   %s' % (idx_name, response.body))
+    except tornado.httpclient.HTTPError:
+        pass
+
+
+def create_index(idx_name):
+    schema = {
+        "settings": {
+            "number_of_shards":   tornado.options.options.num_of_shards,
+            "number_of_replicas": tornado.options.options.num_of_replicas
+        },
+        "refresh": True
+    }
+
+    body = json.dumps(schema)
+    url = "%s/%s" % (tornado.options.options.es_url, idx_name)
+    try:
+        logging.info('Trying to create index %s' % (url))
+        request = tornado.httpclient.HTTPRequest(url, method="PUT", body=body, request_timeout=240,
+                                                 auth_username=tornado.options.options.username, 
+                                                 auth_password=tornado.options.options.password)
+        response = tornado.httpclient.HTTPClient().fetch(request)
+        logging.info('Creating index "%s" done   %s' % (idx_name, response.body))
+    except tornado.httpclient.HTTPError:
+        logging.info('Looks like the index exists already')
+        pass
+
+
+@tornado.gen.coroutine
+def upload_batch(upload_data_txt):
+    try:
+        request = tornado.httpclient.HTTPRequest(tornado.options.options.es_url + "/_bulk",
+                                                 method="POST", body=upload_data_txt,
+                                                 request_timeout=
+                                                 tornado.options.options.http_upload_timeout,
+                                                 auth_username=tornado.options.options.username, 
+                                                 auth_password=tornado.options.options.password)
+        response = yield async_http_client.fetch(request)
+    except Exception as ex:
+        logging.error("upload failed, error: %s" % ex)
+        return
+
+    result = json.loads(response.body.decode('utf-8'))
+    res_txt = "OK" if not result['errors'] else "FAILED"
+    took = int(result['took'])
+    logging.info("Upload: %s - upload took: %5dms, total docs uploaded: %7d" % (res_txt, took, 
+                                                                                upload_data_count))
+
+
+def get_data_for_format(format,count):
+    split_f = format.split(":")
+    if not split_f:
+        return None, None
+
+    field_name = split_f[0]
+    field_type = split_f[1]
+
+    return_val = ''
+
+    if field_type == "bool":
+        if count%2 == 0:
+           return_val = True
+        else:
+           return_val = False
+
+    elif field_type == "str":
+        return_val = field_name + str(count)
+
+    elif field_type == "int":
+        return_val = count
+    
+    elif field_type == "ipv4":
+        return_val = "{0}.{1}.{2}.{3}".format(1,2,3,count%255)
+
+    elif field_type in ["ts", "tstxt"]:
+        return_val = int(count * 1000) if field_type == "ts" else \
+                     datetime.datetime.fromtimestamp(count) \
+                     .strftime("%Y-%m-%dT%H:%M:%S.000-0000")
+
+    elif field_type == "words":
+        return_val = field_name + str(count)
+
+    elif field_type == "dict":
+        mydict = dict(a=field_name + str(count), b=field_name + str(count), c=field_name + str(count),
+                      d=field_name + str(count), e=field_name + str(count), f=field_name + str(count),
+                      g=field_name + str(count), h=field_name + str(count), i=field_name + str(count), 
+                      j=field_name + str(count))
+        return_val = ", ".join("=".join(_) for _ in mydict.items())
+
+    elif field_type == "text":
+        return_val = field_name + str(count)
+
+    return field_name, return_val
+
+
+def generate_count(min, max):
+    if min == max:
+        return max
+    elif min > max:
+        return random.randrange(max, min)
+    else:
+        return random.randrange(min, max)
+
+
+def generate_random_doc(format,count):
+    global id_counter
+
+    res = {}
+
+    for f in format:
+        f_key, f_val = get_data_for_format(f,count)
+        if f_key:
+            res[f_key] = f_val
+
+    if not tornado.options.options.id_type:
+        return res
+
+    if tornado.options.options.id_type == 'int':
+        res['_id'] = id_counter
+        id_counter += 1
+    elif tornado.options.options.id_type == 'uuid4':
+        res['_id'] = str(uuid.uuid4())
+
+    return res
+
+
+def set_index_refresh(val):
+
+    params = {"index": {"refresh_interval": val}}
+    body = json.dumps(params)
+    url = "%s/%s/_settings" % (tornado.options.options.es_url, tornado.options.options.index_name)
+    try:
+        request = tornado.httpclient.HTTPRequest(url, method="PUT", body=body, request_timeout=240,
+                                                 auth_username=tornado.options.options.username, 
+                                                 auth_password=tornado.options.options.password)
+        http_client = tornado.httpclient.HTTPClient()
+        http_client.fetch(request)
+        logging.info('Set index refresh to %s' % val)
+    except Exception as ex:
+        logging.exception(ex)
+
+
+@tornado.gen.coroutine
+def generate_test_data():
+
+    global upload_data_count
+
+    if tornado.options.options.force_init_index:
+        delete_index(tornado.options.options.index_name)
+
+    create_index(tornado.options.options.index_name)
+
+    # todo: query what refresh is set to, then restore later
+    if tornado.options.options.set_refresh:
+        set_index_refresh("-1")
+
+    if tornado.options.options.out_file:
+        out_file = open(tornado.options.options.out_file, "w")
+    else:
+        out_file = None
+
+    if tornado.options.options.dict_file:
+        global _dict_data
+        with open(tornado.options.options.dict_file, 'r') as f:
+            _dict_data = f.readlines()
+        logging.info("Loaded %d words from the %s" % (len(_dict_data), 
+                                                      tornado.options.options.dict_file))
+
+    format = tornado.options.options.format.split(',')
+    if not format:
+        logging.error('invalid format')
+        exit(1)
+
+    ts_start = int(time.time())
+    upload_data_txt = ""
+    total_uploaded = 0
+
+    logging.info("Generating %d docs, upload batch size is %d" % (tornado.options.options.count,
+                                                                  tornado.options
+                                                                  .options.batch_size))
+    for num in range(0, tornado.options.options.count):
+
+        item = generate_random_doc(format,num)
+
+        if out_file:
+            out_file.write("%s\n" % json.dumps(item))
+
+        cmd = {'index': {'_index': tornado.options.options.index_name,
+                         '_type': tornado.options.options.index_type}}
+        if '_id' in item:
+            cmd['index']['_id'] = item['_id']
+
+        upload_data_txt += json.dumps(cmd) + "\n"
+        upload_data_txt += json.dumps(item) + "\n"
+        upload_data_count += 1
+
+        if upload_data_count % tornado.options.options.batch_size == 0:
+            yield upload_batch(upload_data_txt)
+            upload_data_txt = ""
+
+    # upload remaining items in `upload_data_txt`
+    if upload_data_txt:
+        yield upload_batch(upload_data_txt)
+
+    if tornado.options.options.set_refresh:
+        set_index_refresh("1s")
+
+    if out_file:
+        out_file.close()
+
+    took_secs = int(time.time() - ts_start)
+
+    logging.info("Done - total docs uploaded: %d, took %d seconds" % 
+    			 (tornado.options.options.count, took_secs))
+
+
+if __name__ == '__main__':
+    tornado.options.define("es_url", type=str, default='http://localhost:9200/', 
+                           help="URL of your Elasticsearch node")
+    tornado.options.define("index_name", type=str, default='test_data', 
+                           help="Name of the index to store your messages")
+    tornado.options.define("index_type", type=str, default='test_type', help="Type")
+    tornado.options.define("batch_size", type=int, default=1000, 
+                           help="Elasticsearch bulk index batch size")
+    tornado.options.define("num_of_shards", type=int, default=2, 
+                           help="Number of shards for ES index")
+    tornado.options.define("http_upload_timeout", type=int, default=3, 
+                           help="Timeout in seconds when uploading data")
+    tornado.options.define("count", type=int, default=100000, help="Number of docs to generate")
+    tornado.options.define("format", type=str, default='name:str,age:int,last_updated:ts', 
+                           help="message format")
+    tornado.options.define("num_of_replicas", type=int, default=0, 
+                           help="Number of replicas for ES index")
+    tornado.options.define("force_init_index", type=bool, default=False, 
+                           help="Force deleting and re-initializing the Elasticsearch index")
+    tornado.options.define("set_refresh", type=bool, default=False, 
+                           help="Set refresh rate to -1 before starting the upload")
+    tornado.options.define("out_file", type=str, default=False, 
+                           help="If set, write test data to out_file as well.")
+    tornado.options.define("id_type", type=str, default=None, 
+                           help="Type of 'id' to use for the docs, \
+                           valid settings are int and uuid4, None is default")
+    tornado.options.define("dict_file", type=str, default=None, 
+                           help="Name of dictionary file to use")
+    tornado.options.define("username", type=str, default=None, help="Username for elasticsearch")
+    tornado.options.define("password", type=str, default=None, help="Password for elasticsearch")
+    tornado.options.parse_command_line()
+
+    tornado.ioloop.IOLoop.instance().run_sync(generate_test_data)

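The generator can also be run directly with the command-line options defined above; a minimal
sketch, assuming Python with tornado installed (as prepared by data-load-setup.sh) and a reachable
Elasticsearch endpoint:

    # Generate 1000 docs in the default format and index them into a local node.
    python es_test_data.py --es_url=http://localhost:9200 --count=1000 \
        --format=name:str,age:int,last_updated:ts --index_name=test_data --batch_size=500
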
http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/show-health.sh
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/show-health.sh b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/show-health.sh
new file mode 100644
index 0000000..8fa912c
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/elasticsearch/show-health.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+external_ip="$(kubectl get svc elasticsearch -o jsonpath='{.status.loadBalancer.ingress[0].ip}')"
+
+echo "Elasticsearch cluster health info"
+echo "---------------------------------"
+curl $external_ip:9200/_cluster/health
+echo # empty line since curl doesn't output CRLF
\ No newline at end of file

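Beyond _cluster/health, a couple of standard Elasticsearch endpoints are useful for the same kind of
check; a minimal sketch, assuming the same $external_ip resolved in show-health.sh:

    # List indices with document counts and sizes.
    curl "$external_ip:9200/_cat/indices?v"
    # Show the nodes that joined the cluster and their roles.
    curl "$external_ip:9200/_cat/nodes?v"
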
http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/pom.xml b/sdks/java/io/hadoop/pom.xml
new file mode 100644
index 0000000..1982c25
--- /dev/null
+++ b/sdks/java/io/hadoop/pom.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.beam</groupId>
+    <artifactId>beam-sdks-java-io-parent</artifactId>
+    <version>0.7.0-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+  <packaging>pom</packaging>
+  <artifactId>beam-sdks-java-io-hadoop-parent</artifactId>
+  <name>Apache Beam :: SDKs :: Java :: IO :: Hadoop</name>
+  <description>Parent for Beam SDK Hadoop IO which reads data from any source which implements Hadoop Input Format.</description>
+
+  <modules>
+    <module>jdk1.8-tests</module>
+    <module>input-format</module>
+  </modules>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.beam</groupId>
+      <artifactId>beam-sdks-java-core</artifactId>
+    </dependency>
+
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-all</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+</project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/io/pom.xml b/sdks/java/io/pom.xml
index 73fbba1..27fc614 100644
--- a/sdks/java/io/pom.xml
+++ b/sdks/java/io/pom.xml
@@ -68,6 +68,7 @@
     <module>elasticsearch</module>
     <module>google-cloud-platform</module>
     <module>hadoop-common</module>
+    <module>hadoop</module>
     <module>hbase</module>
     <module>hdfs</module>
     <module>jdbc</module>
@@ -114,5 +115,4 @@
       </properties>
     </profile>
   </profiles>
-
 </project>


[4/7] beam git commit: HadoopInputFormatIO with junits

Posted by da...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIO.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIO.java b/sdks/java/io/hadoop/input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIO.java
new file mode 100644
index 0000000..675f4bf
--- /dev/null
+++ b/sdks/java/io/hadoop/input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIO.java
@@ -0,0 +1,842 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.beam.sdk.io.hadoop.inputformat;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import com.google.auto.value.AutoValue;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Function;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import com.google.common.util.concurrent.AtomicDouble;
+
+import java.io.Externalizable;
+import java.io.IOException;
+import java.io.ObjectInput;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutput;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicLong;
+
+import javax.annotation.Nullable;
+
+import org.apache.beam.sdk.coders.CannotProvideCoderException;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.CoderException;
+import org.apache.beam.sdk.coders.CoderRegistry;
+import org.apache.beam.sdk.coders.KvCoder;
+import org.apache.beam.sdk.io.BoundedSource;
+import org.apache.beam.sdk.io.hadoop.WritableCoder;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.SimpleFunction;
+import org.apache.beam.sdk.transforms.display.DisplayData;
+import org.apache.beam.sdk.util.CoderUtils;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PBegin;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.TypeDescriptor;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.ObjectWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A {@link HadoopInputFormatIO} is a Transform for reading data from any source which
+ * implements Hadoop {@link InputFormat}, for example Cassandra, Elasticsearch, HBase, Redis,
+ * Postgres, etc. {@link HadoopInputFormatIO} has to make several performance trade-offs in
+ * connecting to {@link InputFormat}, so if there is another Beam IO Transform specifically for
+ * connecting to your data source of choice, we would recommend using that one, but this IO
+ * Transform allows you to connect to many data sources that do not yet have a Beam IO Transform.
+ *
+ * <p>You will need to pass a Hadoop {@link Configuration} with parameters specifying how the read
+ * will occur. Many properties of the Configuration are optional, and some are required for certain
+ * {@link InputFormat} classes, but the following properties must be set for all InputFormats:
+ * <ul>
+ * <li>{@code mapreduce.job.inputformat.class}: The {@link InputFormat} class used to connect to
+ * your data source of choice.</li>
+ * <li>{@code key.class}: The key class returned by the {@link InputFormat} in
+ * {@code mapreduce.job.inputformat.class}.</li>
+ * <li>{@code value.class}: The value class returned by the {@link InputFormat} in
+ * {@code mapreduce.job.inputformat.class}.</li>
+ * </ul>
+ * For example:
+ *
+ * <pre>
+ * {
+ *   Configuration myHadoopConfiguration = new Configuration(false);
+ *   // Set Hadoop InputFormat, key and value class in configuration
+ *   myHadoopConfiguration.setClass(&quot;mapreduce.job.inputformat.class&quot;,
+ *      MyDbInputFormatClass, InputFormat.class);
+ *   myHadoopConfiguration.setClass(&quot;key.class&quot;, MyDbInputFormatKeyClass, Object.class);
+ *   myHadoopConfiguration.setClass(&quot;value.class&quot;,
+ *      MyDbInputFormatValueClass, Object.class);
+ * }
+ * </pre>
+ *
+ * <p>You will need to check to see if the key and value classes output by the {@link InputFormat}
+ * have a Beam {@link Coder} available. If not, you can use withKeyTranslation/withValueTranslation
+ * to specify a method transforming instances of those classes into another class that is supported
+ * by a Beam {@link Coder}. These settings are optional and you don't need to specify translation
+ * for both key and value. If you specify a translation, you will need to make sure the K or V of
+ * the read transform match the output type of the translation.
+ *
+ * <p>You will need to set the appropriate InputFormat key and value classes (i.e. "key.class" and
+ * "value.class") in the Hadoop {@link Configuration}. If the configured key or value class differs
+ * from the InputFormat's actual key or value class, decoding the key/value objects may fail with
+ * an error like "unexpected extra bytes after decoding". Hence, it is important to set the correct
+ * InputFormat key and value classes.
+ *
+ * <h3>Reading using {@link HadoopInputFormatIO}</h3>
+ *
+ * <pre>
+ * {@code
+ * Pipeline p = ...; // Create pipeline.
+ * // Read data only with Hadoop configuration.
+ * p.apply("read",
+ *     HadoopInputFormatIO.<InputFormatKeyClass, InputFormatValueClass>read()
+ *              .withConfiguration(myHadoopConfiguration));
+ * }
+ * // Read data with configuration and key translation (example scenario: a Beam Coder is not
+ * // available for the key class, hence key translation is required).
+ * SimpleFunction&lt;InputFormatKeyClass, MyKeyClass&gt; myOutputKeyType =
+ *       new SimpleFunction&lt;InputFormatKeyClass, MyKeyClass&gt;() {
+ *         public MyKeyClass apply(InputFormatKeyClass input) {
+ *           // ...logic to transform InputFormatKeyClass to MyKeyClass
+ *         }
+ * };
+ * </pre>
+ *
+ * <pre>
+ * {@code
+ * p.apply("read",
+ *     HadoopInputFormatIO.<MyKeyClass, InputFormatValueClass>read()
+ *              .withConfiguration(myHadoopConfiguration)
+ *              .withKeyTranslation(myOutputKeyType));
+ * }
+ * </pre>
+ *
+ * <p>Read data with configuration and value translation (example scenario: a Beam Coder is not
+ * available for the value class, hence value translation is required):
+ *
+ * <pre>
+ * {@code
+ * SimpleFunction&lt;InputFormatValueClass, MyValueClass&gt; myOutputValueType =
+ *      new SimpleFunction&lt;InputFormatValueClass, MyValueClass&gt;() {
+ *          public MyValueClass apply(InputFormatValueClass input) {
+ *            // ...logic to transform InputFormatValueClass to MyValueClass
+ *          }
+ *  };
+ * }
+ * </pre>
+ *
+ * <pre>
+ * {@code
+ * p.apply("read",
+ *     HadoopInputFormatIO.<InputFormatKeyClass, MyValueClass>read()
+ *              .withConfiguration(myHadoopConfiguration)
+ *              .withValueTranslation(myOutputValueType));
+ * }
+ * </pre>
+ */
+
+public class HadoopInputFormatIO {
+  private static final Logger LOG = LoggerFactory.getLogger(HadoopInputFormatIO.class);
+
+  /**
+   * Creates an uninitialized {@link HadoopInputFormatIO.Read}. Before use, the {@code Read} must
+   * be initialized with a {@link HadoopInputFormatIO.Read#withConfiguration} call that
+   * specifies the source. A key/value translation may also optionally be specified using
+   * {@link HadoopInputFormatIO.Read#withKeyTranslation}/
+   * {@link HadoopInputFormatIO.Read#withValueTranslation}.
+   */
+  public static <K, V> Read<K, V> read() {
+    return new AutoValue_HadoopInputFormatIO_Read.Builder<K, V>().build();
+  }
+
+  /**
+   * A {@link PTransform} that reads from any data source which implements Hadoop InputFormat,
+   * e.g. Cassandra, Elasticsearch, HBase, Redis, Postgres, etc. See the class-level Javadoc on
+   * {@link HadoopInputFormatIO} for more information.
+   * @param <K> Type of keys to be read.
+   * @param <V> Type of values to be read.
+   * @see HadoopInputFormatIO
+   */
+  @AutoValue
+  public abstract static class Read<K, V> extends PTransform<PBegin, PCollection<KV<K, V>>> {
+
+    // Returns the Hadoop Configuration which contains the specification of the source.
+    @Nullable
+    public abstract SerializableConfiguration getConfiguration();
+
+    @Nullable public abstract SimpleFunction<?, K> getKeyTranslationFunction();
+    @Nullable public abstract SimpleFunction<?, V> getValueTranslationFunction();
+    @Nullable public abstract TypeDescriptor<K> getKeyTypeDescriptor();
+    @Nullable public abstract TypeDescriptor<V> getValueTypeDescriptor();
+    @Nullable public abstract TypeDescriptor<?> getinputFormatClass();
+    @Nullable public abstract TypeDescriptor<?> getinputFormatKeyClass();
+    @Nullable public abstract TypeDescriptor<?> getinputFormatValueClass();
+
+    abstract Builder<K, V> toBuilder();
+
+    @AutoValue.Builder
+    abstract static class Builder<K, V> {
+      abstract Builder<K, V> setConfiguration(SerializableConfiguration configuration);
+      abstract Builder<K, V> setKeyTranslationFunction(SimpleFunction<?, K> function);
+      abstract Builder<K, V> setValueTranslationFunction(SimpleFunction<?, V> function);
+      abstract Builder<K, V> setKeyTypeDescriptor(TypeDescriptor<K> keyTypeDescriptor);
+      abstract Builder<K, V> setValueTypeDescriptor(TypeDescriptor<V> valueTypeDescriptor);
+      abstract Builder<K, V> setInputFormatClass(TypeDescriptor<?> inputFormatClass);
+      abstract Builder<K, V> setInputFormatKeyClass(TypeDescriptor<?> inputFormatKeyClass);
+      abstract Builder<K, V> setInputFormatValueClass(TypeDescriptor<?> inputFormatValueClass);
+      abstract Read<K, V> build();
+    }
+
+    /**
+     * Returns a new {@link HadoopInputFormatIO.Read} that will read from the source using the
+     * options provided by the given configuration.
+     *
+     * <p>Does not modify this object.
+     */
+    public Read<K, V> withConfiguration(Configuration configuration) {
+      validateConfiguration(configuration);
+      TypeDescriptor<?> inputFormatClass =
+          TypeDescriptor.of(configuration.getClass("mapreduce.job.inputformat.class", null));
+      TypeDescriptor<?> inputFormatKeyClass =
+          TypeDescriptor.of(configuration.getClass("key.class", null));
+      TypeDescriptor<?> inputFormatValueClass =
+          TypeDescriptor.of(configuration.getClass("value.class", null));
+      Builder<K, V> builder =
+          toBuilder().setConfiguration(new SerializableConfiguration(configuration));
+      builder.setInputFormatClass(inputFormatClass);
+      builder.setInputFormatKeyClass(inputFormatKeyClass);
+      builder.setInputFormatValueClass(inputFormatValueClass);
+      /*
+       * Sets the output key class to InputFormat key class if withKeyTranslation() is not called
+       * yet.
+       */
+      if (getKeyTranslationFunction() == null) {
+        builder.setKeyTypeDescriptor((TypeDescriptor<K>) inputFormatKeyClass);
+      }
+      /*
+       * Sets the output value class to InputFormat value class if withValueTranslation() is not
+       * called yet.
+       */
+      if (getValueTranslationFunction() == null) {
+        builder.setValueTypeDescriptor((TypeDescriptor<V>) inputFormatValueClass);
+      }
+      return builder.build();
+    }
+
+    /**
+     * Returns a new {@link HadoopInputFormatIO.Read} that will transform the keys read from the
+     * source using the given key translation function.
+     *
+     * <p>Does not modify this object.
+     */
+    public Read<K, V> withKeyTranslation(SimpleFunction<?, K> function) {
+      checkNotNull(function, "function");
+      // Sets key class to key translation function's output class type.
+      return toBuilder().setKeyTranslationFunction(function)
+          .setKeyTypeDescriptor((TypeDescriptor<K>) function.getOutputTypeDescriptor()).build();
+    }
+
+    /**
+     * Returns a new {@link HadoopInputFormatIO.Read} that will transform the values read from the
+     * source using the given value translation function.
+     *
+     * <p>Does not modify this object.
+     */
+    public Read<K, V> withValueTranslation(SimpleFunction<?, V> function) {
+      checkNotNull(function, "function");
+      // Sets value class to value translation function's output class type.
+      return toBuilder().setValueTranslationFunction(function)
+          .setValueTypeDescriptor((TypeDescriptor<V>) function.getOutputTypeDescriptor()).build();
+    }
+
+    @Override
+    public PCollection<KV<K, V>> expand(PBegin input) {
+      // Get the key and value coders based on the key and value classes.
+      CoderRegistry coderRegistry = input.getPipeline().getCoderRegistry();
+      Coder<K> keyCoder = getDefaultCoder(getKeyTypeDescriptor(), coderRegistry);
+      Coder<V> valueCoder = getDefaultCoder(getValueTypeDescriptor(), coderRegistry);
+      HadoopInputFormatBoundedSource<K, V> source = new HadoopInputFormatBoundedSource<K, V>(
+          getConfiguration(),
+          keyCoder,
+          valueCoder,
+          getKeyTranslationFunction(),
+          getValueTranslationFunction());
+      return input.getPipeline().apply(org.apache.beam.sdk.io.Read.from(source));
+    }
+
+    /**
+     * Validates that the mandatory configuration properties such as InputFormat class, InputFormat
+     * key and value classes are provided in the Hadoop configuration.
+     */
+    private void validateConfiguration(Configuration configuration) {
+      checkNotNull(configuration, "configuration");
+      checkNotNull(configuration.get("mapreduce.job.inputformat.class"),
+          "configuration.get(\"mapreduce.job.inputformat.class\")");
+      checkNotNull(configuration.get("key.class"), "configuration.get(\"key.class\")");
+      checkNotNull(configuration.get("value.class"),
+          "configuration.get(\"value.class\")");
+    }
+
+    /**
+     * Validates inputs provided by the pipeline user before reading the data.
+     */
+    @Override
+    public void validate(PBegin input) {
+      checkNotNull(getConfiguration(), "getConfiguration()");
+      // Validate that the key translation input type is the same as the key class of the InputFormat.
+      validateTranslationFunction(getinputFormatKeyClass(), getKeyTranslationFunction(),
+          "Key translation's input type is not same as hadoop InputFormat : %s key class : %s");
+      // Validate that the value translation input type matches the InputFormat value class.
+      validateTranslationFunction(getinputFormatValueClass(), getValueTranslationFunction(),
+          "Value translation's input type is not same as hadoop InputFormat :  "
+              + "%s value class : %s");
+    }
+
+    /**
+     * Validates translation function given for key/value translation.
+     */
+    private void validateTranslationFunction(TypeDescriptor<?> inputType,
+        SimpleFunction<?, ?> simpleFunction, String errorMsg) {
+      if (simpleFunction != null) {
+        if (!simpleFunction.getInputTypeDescriptor().equals(inputType)) {
+          throw new IllegalArgumentException(
+              String.format(errorMsg, getinputFormatClass().getRawType(), inputType.getRawType()));
+        }
+      }
+    }
+
+    /**
+     * Returns the default coder for a given type descriptor. The Coder Registry is queried for
+     * the correct coder. If the type descriptor is not found in the Coder Registry and it is of
+     * type Writable, then a WritableCoder is returned; otherwise an exception "Cannot find coder"
+     * is thrown.
+     */
+    public <T> Coder<T> getDefaultCoder(TypeDescriptor<?> typeDesc, CoderRegistry coderRegistry) {
+      Class classType = typeDesc.getRawType();
+      try {
+        return (Coder<T>) coderRegistry.getCoder(typeDesc);
+      } catch (CannotProvideCoderException e) {
+        if (Writable.class.isAssignableFrom(classType)) {
+          return (Coder<T>) WritableCoder.of(classType);
+        }
+        throw new IllegalStateException(String.format("Cannot find coder for %s  : ", typeDesc)
+            + e.getMessage(), e);
+      }
+    }
+
+    @Override
+    public void populateDisplayData(DisplayData.Builder builder) {
+      super.populateDisplayData(builder);
+      if (getConfiguration().getHadoopConfiguration() != null) {
+        Iterator<Entry<String, String>> configProperties = getConfiguration()
+            .getHadoopConfiguration().iterator();
+        while (configProperties.hasNext()) {
+          Entry<String, String> property = configProperties.next();
+          builder.addIfNotNull(DisplayData.item(property.getKey(), property.getValue())
+              .withLabel(property.getKey()));
+        }
+      }
+    }
+  }
+
+  /**
+   * Bounded source implementation for {@link HadoopInputFormatIO}.
+   * @param <K> Type of keys to be read.
+   * @param <V> Type of values to be read.
+   */
+  public static class HadoopInputFormatBoundedSource<K, V> extends BoundedSource<KV<K, V>>
+      implements Serializable {
+    private final SerializableConfiguration conf;
+    private final Coder<K> keyCoder;
+    private final Coder<V> valueCoder;
+    @Nullable private final SimpleFunction<?, K> keyTranslationFunction;
+    @Nullable private final SimpleFunction<?, V> valueTranslationFunction;
+    private final SerializableSplit inputSplit;
+    private transient List<SerializableSplit> inputSplits;
+    private long boundedSourceEstimatedSize = 0;
+    private transient InputFormat<?, ?> inputFormatObj;
+    private transient TaskAttemptContext taskAttemptContext;
+    private static final Set<Class<?>> immutableTypes = new HashSet<Class<?>>(
+        Arrays.asList(
+            String.class,
+            Byte.class,
+            Short.class,
+            Integer.class,
+            Long.class,
+            Float.class,
+            Double.class,
+            Boolean.class,
+            BigInteger.class,
+            BigDecimal.class));
+
+    HadoopInputFormatBoundedSource(
+        SerializableConfiguration conf,
+        Coder<K> keyCoder,
+        Coder<V> valueCoder,
+        @Nullable SimpleFunction<?, K> keyTranslationFunction,
+        @Nullable SimpleFunction<?, V> valueTranslationFunction) {
+      this(conf,
+          keyCoder,
+          valueCoder,
+          keyTranslationFunction,
+          valueTranslationFunction,
+          null);
+    }
+
+    protected HadoopInputFormatBoundedSource(
+        SerializableConfiguration conf,
+        Coder<K> keyCoder,
+        Coder<V> valueCoder,
+        @Nullable SimpleFunction<?, K> keyTranslationFunction,
+        @Nullable SimpleFunction<?, V> valueTranslationFunction,
+        SerializableSplit inputSplit) {
+      this.conf = conf;
+      this.inputSplit = inputSplit;
+      this.keyCoder = keyCoder;
+      this.valueCoder = valueCoder;
+      this.keyTranslationFunction = keyTranslationFunction;
+      this.valueTranslationFunction = valueTranslationFunction;
+    }
+
+    public SerializableConfiguration getConfiguration() {
+      return conf;
+    }
+
+    @Override
+    public void validate() {
+      checkNotNull(conf, "conf");
+      checkNotNull(keyCoder, "keyCoder");
+      checkNotNull(valueCoder, "valueCoder");
+    }
+
+    @Override
+    public List<BoundedSource<KV<K, V>>> splitIntoBundles(long desiredBundleSizeBytes,
+        PipelineOptions options) throws Exception {
+      // desiredBundleSizeBytes is not considered because splitting based on this value is not
+      // supported by the InputFormat's getSplits() method.
+      if (inputSplit != null) {
+        LOG.info("Not splitting source {} because source is already split.", this);
+        return ImmutableList.of((BoundedSource<KV<K, V>>) this);
+      }
+      computeSplitsIfNecessary();
+      LOG.info("Generated {} splits. Size of first split is {} ", inputSplits.size(), inputSplits
+          .get(0).getSplit().getLength());
+      return Lists.transform(inputSplits,
+          new Function<SerializableSplit, BoundedSource<KV<K, V>>>() {
+            @Override
+            public BoundedSource<KV<K, V>> apply(SerializableSplit serializableInputSplit) {
+              HadoopInputFormatBoundedSource<K, V> hifBoundedSource =
+                  new HadoopInputFormatBoundedSource<K, V>(conf, keyCoder, valueCoder,
+                      keyTranslationFunction, valueTranslationFunction, serializableInputSplit);
+              return hifBoundedSource;
+            }
+          });
+    }
+
+    @Override
+    public long getEstimatedSizeBytes(PipelineOptions po) throws Exception {
+      if (inputSplit == null) {
+        // If there are no splits computed yet, then retrieve the splits.
+        computeSplitsIfNecessary();
+        return boundedSourceEstimatedSize;
+      }
+      return inputSplit.getSplit().getLength();
+    }
+
+    /**
+     * Helper method to compute splits. It also calculates the size of the data being read. Note:
+     * this method is executed exactly once; the splits are retrieved and cached in this source,
+     * and are then used by splitIntoBundles() and getEstimatedSizeBytes().
+     */
+    @VisibleForTesting
+    void computeSplitsIfNecessary() throws IOException, InterruptedException {
+      if (inputSplits != null) {
+        return;
+      }
+      createInputFormatInstance();
+      List<InputSplit> splits =
+          inputFormatObj.getSplits(Job.getInstance(conf.getHadoopConfiguration()));
+      if (splits == null) {
+        throw new IOException("Error in computing splits, getSplits() returns null.");
+      }
+      if (splits.isEmpty()) {
+        throw new IOException("Error in computing splits, getSplits() returns a empty list");
+      }
+      boundedSourceEstimatedSize = 0;
+      inputSplits = new ArrayList<SerializableSplit>();
+      for (InputSplit inputSplit : splits) {
+        if (inputSplit == null) {
+          throw new IOException("Error in computing splits, split is null in InputSplits list "
+              + "populated by getSplits() : ");
+        }
+        boundedSourceEstimatedSize += inputSplit.getLength();
+        inputSplits.add(new SerializableSplit(inputSplit));
+      }
+    }
+
+    /**
+     * Creates an instance of the InputFormat class. The InputFormat class name is specified in
+     * the Hadoop configuration.
+     */
+    protected void createInputFormatInstance() throws IOException {
+      if (inputFormatObj == null) {
+        try {
+          taskAttemptContext =
+              new TaskAttemptContextImpl(conf.getHadoopConfiguration(), new TaskAttemptID());
+          inputFormatObj =
+              (InputFormat<?, ?>) conf
+                  .getHadoopConfiguration()
+                  .getClassByName(
+                      conf.getHadoopConfiguration().get("mapreduce.job.inputformat.class"))
+                  .newInstance();
+          /*
+           * If InputFormat explicitly implements interface {@link Configurable}, then setConf()
+           * method of {@link Configurable} needs to be explicitly called to set all the
+           * configuration parameters. For example: InputFormat classes which implement Configurable
+           * are {@link org.apache.hadoop.mapreduce.lib.db.DBInputFormat DBInputFormat}, {@link
+           * org.apache.hadoop.hbase.mapreduce.TableInputFormat TableInputFormat}, etc.
+           */
+          if (Configurable.class.isAssignableFrom(inputFormatObj.getClass())) {
+            ((Configurable) inputFormatObj).setConf(conf.getHadoopConfiguration());
+          }
+        } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
+          throw new IOException("Unable to create InputFormat object: ", e);
+        }
+      }
+    }
+
+    @VisibleForTesting
+    InputFormat<?, ?> getInputFormat() {
+      return inputFormatObj;
+    }
+
+    @VisibleForTesting
+    void setInputFormatObj(InputFormat<?, ?> inputFormatObj) {
+      this.inputFormatObj = inputFormatObj;
+    }
+
+    @Override
+    public Coder<KV<K, V>> getDefaultOutputCoder() {
+      return KvCoder.of(keyCoder, valueCoder);
+    }
+
+    @Override
+    public BoundedReader<KV<K, V>> createReader(PipelineOptions options) throws IOException {
+      this.validate();
+      if (inputSplit == null) {
+        throw new IOException("Cannot create reader as source is not split yet.");
+      } else {
+        createInputFormatInstance();
+        return new HadoopInputFormatReader<>(
+            this,
+            keyTranslationFunction,
+            valueTranslationFunction,
+            inputSplit,
+            inputFormatObj,
+            taskAttemptContext);
+      }
+    }
+
+    /**
+     * BoundedReader for Hadoop InputFormat source.
+     *
+     * @param <T1> Type of keys RecordReader emits.
+     * @param <T2> Type of values RecordReader emits.
+     */
+    class HadoopInputFormatReader<T1, T2> extends BoundedSource.BoundedReader<KV<K, V>> {
+
+      private final HadoopInputFormatBoundedSource<K, V> source;
+      @Nullable private final SimpleFunction<T1, K> keyTranslationFunction;
+      @Nullable private final SimpleFunction<T2, V> valueTranslationFunction;
+      private final SerializableSplit split;
+      private RecordReader<T1, T2> recordReader;
+      private volatile boolean doneReading = false;
+      private AtomicLong recordsReturned = new AtomicLong();
+      // Tracks the progress of the RecordReader.
+      private AtomicDouble progressValue = new AtomicDouble();
+      private transient InputFormat<T1, T2> inputFormatObj;
+      private transient TaskAttemptContext taskAttemptContext;
+
+      private HadoopInputFormatReader(HadoopInputFormatBoundedSource<K, V> source,
+          @Nullable SimpleFunction keyTranslationFunction,
+          @Nullable SimpleFunction valueTranslationFunction,
+          SerializableSplit split,
+          InputFormat inputFormatObj,
+          TaskAttemptContext taskAttemptContext) {
+        this.source = source;
+        this.keyTranslationFunction = keyTranslationFunction;
+        this.valueTranslationFunction = valueTranslationFunction;
+        this.split = split;
+        this.inputFormatObj = inputFormatObj;
+        this.taskAttemptContext = taskAttemptContext;
+      }
+
+      @Override
+      public HadoopInputFormatBoundedSource<K, V> getCurrentSource() {
+        return source;
+      }
+
+      @Override
+      public boolean start() throws IOException {
+        try {
+          recordsReturned.set(0L);
+          recordReader =
+              (RecordReader<T1, T2>) inputFormatObj.createRecordReader(split.getSplit(),
+                  taskAttemptContext);
+          if (recordReader != null) {
+            recordReader.initialize(split.getSplit(), taskAttemptContext);
+            progressValue.set(getProgress());
+            if (recordReader.nextKeyValue()) {
+              recordsReturned.incrementAndGet();
+              doneReading = false;
+              return true;
+            }
+          } else {
+            throw new IOException(String.format("Null RecordReader object returned by %s",
+                inputFormatObj.getClass()));
+          }
+          recordReader = null;
+        } catch (InterruptedException e) {
+          throw new IOException(
+              "Could not read because the thread got interrupted while "
+              + "reading the records with an exception: ",
+              e);
+        }
+        doneReading = true;
+        return false;
+      }
+
+      @Override
+      public boolean advance() throws IOException {
+        try {
+          progressValue.set(getProgress());
+          if (recordReader.nextKeyValue()) {
+            recordsReturned.incrementAndGet();
+            return true;
+          }
+          doneReading = true;
+        } catch (InterruptedException e) {
+          throw new IOException("Unable to read data: ", e);
+        }
+        return false;
+      }
+
+      @Override
+      public KV<K, V> getCurrent() {
+        K key = null;
+        V value = null;
+        try {
+          // Transform key if translation function is provided.
+          key =
+              transformKeyOrValue((T1) recordReader.getCurrentKey(), keyTranslationFunction,
+                  keyCoder);
+          // Transform value if translation function is provided.
+          value =
+              transformKeyOrValue((T2) recordReader.getCurrentValue(), valueTranslationFunction,
+                  valueCoder);
+        } catch (IOException | InterruptedException e) {
+          LOG.error("Unable to read data: " + "{}", e);
+          throw new IllegalStateException("Unable to read data: " + "{}", e);
+        }
+        return KV.of(key, value);
+      }
+
+      /**
+       * Returns the transformed key or value object, cloned through the given coder if it is
+       * possibly mutable.
+       * @throws ClassCastException
+       * @throws CoderException
+       */
+      private <T, T3> T3 transformKeyOrValue(T input,
+          @Nullable SimpleFunction<T, T3> simpleFunction, Coder<T3> coder) throws CoderException,
+          ClassCastException {
+        T3 output;
+        if (null != simpleFunction) {
+          output = simpleFunction.apply(input);
+        } else {
+          output = (T3) input;
+        }
+        return cloneIfPossiblyMutable((T3) output, coder);
+      }
+
+      /**
+       * Beam expects immutable objects, but the Hadoop InputFormats tend to re-use the same object
+       * when returning them. Hence, mutable objects returned by Hadoop InputFormats are cloned.
+       */
+      private <T> T cloneIfPossiblyMutable(T input, Coder<T> coder) throws CoderException,
+          ClassCastException {
+        // If the input object is not of known immutable type, clone the object.
+        if (!isKnownImmutable(input)) {
+          input = CoderUtils.clone(coder, input);
+        }
+        return input;
+      }
+
+      /**
+       * Utility method to check if the passed object is of a known immutable type.
+       */
+      private boolean isKnownImmutable(Object o) {
+        return immutableTypes.contains(o.getClass());
+      }
+
+      @Override
+      public void close() throws IOException {
+        LOG.info("Closing reader after reading {} records.", recordsReturned);
+        if (recordReader != null) {
+          recordReader.close();
+          recordReader = null;
+        }
+      }
+
+      @Override
+      public Double getFractionConsumed() {
+        if (doneReading) {
+          return 1.0;
+        } else if (recordReader == null || recordsReturned.get() == 0L) {
+          return 0.0;
+        }
+        if (progressValue.get() == 0.0) {
+          return null;
+        }
+        return progressValue.doubleValue();
+      }
+
+      /**
+       * Returns RecordReader's progress.
+       * @throws IOException
+       * @throws InterruptedException
+       */
+      private Double getProgress() throws IOException, InterruptedException {
+        try {
+          float progress = recordReader.getProgress();
+          return (double) progress < 0 || progress > 1 ? 0.0 : progress;
+        } catch (IOException e) {
+          LOG.error(
+              "Error in computing the fraction consumed because RecordReader.getProgress() threw "
+              + "an exception.", e);
+          throw new IOException(
+              "Error in computing the fraction consumed because RecordReader.getProgress() threw "
+              + "an exception: " + e.getMessage(), e);
+        }
+      }
+
+      @Override
+      public final long getSplitPointsRemaining() {
+        if (doneReading) {
+          return 0;
+        }
+        /*
+         * This source does not currently support dynamic work rebalancing, so remaining parallelism
+         * is always 1.
+         */
+        return 1;
+      }
+    }
+  }
+
+  /**
+   * A wrapper to allow Hadoop {@link org.apache.hadoop.mapreduce.InputSplit} to be serialized using
+   * Java's standard serialization mechanisms.
+   */
+  public static class SerializableSplit implements Serializable {
+
+    InputSplit inputSplit;
+
+    public SerializableSplit() {}
+
+    public SerializableSplit(InputSplit split) {
+      checkArgument(split instanceof Writable,
+          String.format("Split is not of type Writable: %s", split));
+      this.inputSplit = split;
+    }
+
+    public InputSplit getSplit() {
+      return inputSplit;
+    }
+
+    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
+      ObjectWritable ow = new ObjectWritable();
+      ow.setConf(new Configuration(false));
+      ow.readFields(in);
+      this.inputSplit = (InputSplit) ow.get();
+    }
+
+    private void writeObject(ObjectOutputStream out) throws IOException {
+      new ObjectWritable(inputSplit).write(out);
+    }
+  }
+
+  /**
+   * A wrapper to allow Hadoop {@link org.apache.hadoop.conf.Configuration} to be serialized using
+   * Java's standard serialization mechanisms. Note that the org.apache.hadoop.conf.Configuration
+   * is Writable.
+   */
+  public static class SerializableConfiguration implements Externalizable {
+
+    private Configuration conf;
+
+    public SerializableConfiguration() {}
+
+    public SerializableConfiguration(Configuration conf) {
+      this.conf = conf;
+    }
+
+    public Configuration getHadoopConfiguration() {
+      return conf;
+    }
+
+    @Override
+    public void writeExternal(ObjectOutput out) throws IOException {
+      out.writeUTF(conf.getClass().getCanonicalName());
+      ((Writable) conf).write(out);
+    }
+
+    @Override
+    public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
+      String className = in.readUTF();
+      try {
+        conf = (Configuration) Class.forName(className).newInstance();
+        conf.readFields(in);
+      } catch (InstantiationException | IllegalAccessException e) {
+        throw new IOException("Unable to create configuration: " + e);
+      }
+    }
+  }
+}
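
For orientation, a minimal usage sketch of the Read transform defined above follows. It is a sketch
only, not part of the committed sources: MyInputFormat and the pipeline variable p are hypothetical
placeholders, while the configuration keys and the withConfiguration()/withValueTranslation()
methods are the ones shown in this file and in the integration tests later in this commit.

    // Sketch only: MyInputFormat stands in for any user-supplied
    // org.apache.hadoop.mapreduce.InputFormat<Text, Text>.
    Configuration hadoopConf = new Configuration();
    hadoopConf.setClass("mapreduce.job.inputformat.class", MyInputFormat.class, InputFormat.class);
    hadoopConf.setClass("key.class", Text.class, Object.class);
    hadoopConf.setClass("value.class", Text.class, Object.class);

    PCollection<KV<Text, String>> records = p.apply(
        HadoopInputFormatIO.<Text, String>read()
            .withConfiguration(hadoopConf)
            // Translate Hadoop Text values to Strings; Text keys fall back to WritableCoder
            // via getDefaultCoder() above.
            .withValueTranslation(new SimpleFunction<Text, String>() {
              @Override
              public String apply(Text input) {
                return input.toString();
              }
            }));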

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/package-info.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/package-info.java b/sdks/java/io/hadoop/input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/package-info.java
new file mode 100644
index 0000000..5488448
--- /dev/null
+++ b/sdks/java/io/hadoop/input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Defines transforms for reading from Data sources which implement Hadoop Input Format.
+ *
+ * @see org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO
+ */
+package org.apache.beam.sdk.io.hadoop.inputformat;

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ConfigurableEmployeeInputFormat.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ConfigurableEmployeeInputFormat.java b/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ConfigurableEmployeeInputFormat.java
new file mode 100644
index 0000000..40f949b
--- /dev/null
+++ b/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ConfigurableEmployeeInputFormat.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.beam.sdk.io.hadoop.inputformat;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+/**
+ * This is a dummy InputFormat used to test reading with HadoopInputFormatIO when the InputFormat
+ * implements Configurable. It validates that the setConf() method is called before getSplits().
+ * Known InputFormats which implement Configurable are DBInputFormat, TableInputFormat, etc.
+ */
+public class ConfigurableEmployeeInputFormat extends InputFormat<Text, Employee> implements
+    Configurable {
+  public boolean isConfSet = false;
+
+  public ConfigurableEmployeeInputFormat() {}
+
+  @Override
+  public Configuration getConf() {
+    return null;
+  }
+
+  /**
+   * Records that the configuration has been set. getSplits() throws an exception if setConf() has
+   * not been called first.
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    isConfSet = true;
+  }
+
+  @Override
+  public RecordReader<Text, Employee> createRecordReader(InputSplit split,
+      TaskAttemptContext context) throws IOException, InterruptedException {
+    return new ConfigurableEmployeeRecordReader();
+  }
+
+  /**
+   * Returns an InputSplit list of {@link ConfigurableEmployeeInputSplit}. Throws an exception if
+   * {@link #setConf(Configuration)} has not been called.
+   */
+  @Override
+  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
+    if (!isConfSet) {
+      throw new IOException("Configuration is not set.");
+    }
+    List<InputSplit> splits = new ArrayList<InputSplit>();
+    splits.add(new ConfigurableEmployeeInputSplit());
+    return splits;
+  }
+
+  /**
+   * InputSplit implementation for ConfigurableEmployeeInputFormat.
+   */
+  public class ConfigurableEmployeeInputSplit extends InputSplit implements Writable {
+
+    @Override
+    public void readFields(DataInput arg0) throws IOException {}
+
+    @Override
+    public void write(DataOutput arg0) throws IOException {}
+
+    @Override
+    public long getLength() throws IOException, InterruptedException {
+      return 0;
+    }
+
+    @Override
+    public String[] getLocations() throws IOException, InterruptedException {
+      return null;
+    }
+  }
+
+  /**
+   * RecordReader for ConfigurableEmployeeInputFormat.
+   */
+  public class ConfigurableEmployeeRecordReader extends RecordReader<Text, Employee> {
+
+    @Override
+    public void initialize(InputSplit paramInputSplit, TaskAttemptContext paramTaskAttemptContext)
+        throws IOException, InterruptedException {}
+
+    @Override
+    public boolean nextKeyValue() throws IOException, InterruptedException {
+      return false;
+    }
+
+    @Override
+    public Text getCurrentKey() throws IOException, InterruptedException {
+      return null;
+    }
+
+    @Override
+    public Employee getCurrentValue() throws IOException, InterruptedException {
+      return null;
+    }
+
+    @Override
+    public float getProgress() throws IOException, InterruptedException {
+      return 0;
+    }
+
+    @Override
+    public void close() throws IOException {}
+  }
+}
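
The unit tests that exercise this InputFormat live in HadoopInputFormatIOTest.java, which is part
of this commit but not reproduced in this message. A rough sketch of how the Configurable path is
expected to be wired (the pipeline variable is assumed to be a TestPipeline, as in the integration
tests below):

    // Sketch only: wire ConfigurableEmployeeInputFormat into a Hadoop Configuration. Because the
    // read path calls setConf() before getSplits() (see createInputFormatInstance() in
    // HadoopInputFormatIO above), getSplits() does not throw "Configuration is not set.".
    Configuration conf = new Configuration();
    conf.setClass("mapreduce.job.inputformat.class",
        ConfigurableEmployeeInputFormat.class, InputFormat.class);
    conf.setClass("key.class", Text.class, Object.class);
    conf.setClass("value.class", Employee.class, Object.class);
    PCollection<KV<Text, Employee>> result =
        pipeline.apply(HadoopInputFormatIO.<Text, Employee>read().withConfiguration(conf));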

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/Employee.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/Employee.java b/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/Employee.java
new file mode 100644
index 0000000..9d4f293
--- /dev/null
+++ b/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/Employee.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.beam.sdk.io.hadoop.inputformat;
+
+import org.apache.beam.sdk.coders.AvroCoder;
+import org.apache.beam.sdk.coders.DefaultCoder;
+
+/**
+ * Employee POJO with two properties: employee name and address. Used in
+ * {@linkplain HadoopInputFormatIO} unit tests.
+ */
+@DefaultCoder(AvroCoder.class)
+public class Employee {
+  private String empAddress;
+  private String empName;
+
+  /**
+   * Empty constructor required for Avro decoding.
+   */
+  public Employee() {}
+
+  public Employee(String empName, String empAddress) {
+    this.empAddress = empAddress;
+    this.empName = empName;
+  }
+
+  public String getEmpName() {
+    return empName;
+  }
+
+  public void setEmpName(String empName) {
+    this.empName = empName;
+  }
+
+  public String getEmpAddress() {
+    return empAddress;
+  }
+
+  public void setEmpAddress(String empAddress) {
+    this.empAddress = empAddress;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+
+    Employee employeePojo = (Employee) o;
+
+    if (empName != null ? !empName.equals(employeePojo.empName) : employeePojo.empName != null) {
+      return false;
+    }
+    if (empAddress != null ? !empAddress.equals(employeePojo.empAddress)
+        : employeePojo.empAddress != null) {
+      return false;
+    }
+    return true;
+  }
+
+  @Override
+  public int hashCode() {
+    return 0;
+  }
+
+  @Override
+  public String toString() {
+    return "Employee{" + "Name='" + empName + '\'' + ", Address=" + empAddress + '}';
+  }
+}
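
Because Employee is mutable and not in the reader's set of known immutable types, values of this
type are cloned through their coder by cloneIfPossiblyMutable() in the reader above. A small sketch
of what the @DefaultCoder annotation enables; the variable names are illustrative and the checked
exceptions (CannotProvideCoderException, CoderException) are left to the caller:

    // Sketch only: the CoderRegistry resolves an AvroCoder for Employee via @DefaultCoder, and
    // CoderUtils.clone() performs a deep copy by encoding and then decoding the object.
    CoderRegistry registry = pipeline.getCoderRegistry();
    Coder<Employee> employeeCoder = registry.getCoder(TypeDescriptor.of(Employee.class));
    Employee original = new Employee("Alex", "GB");
    Employee copy = CoderUtils.clone(employeeCoder, original);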

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/EmployeeInputFormat.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/EmployeeInputFormat.java b/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/EmployeeInputFormat.java
new file mode 100644
index 0000000..206f9ab
--- /dev/null
+++ b/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/EmployeeInputFormat.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.beam.sdk.io.hadoop.inputformat;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.beam.sdk.values.KV;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+/**
+ * This is a valid InputFormat for reading employee data, available in the form of {@code List<KV>}
+ * as {@linkplain EmployeeRecordReader#employeeDataList employeeDataList}, which is populated using
+ * {@linkplain TestEmployeeDataSet#populateEmployeeData()}. {@linkplain EmployeeInputFormat} is
+ * used to test whether the {@linkplain HadoopInputFormatIO} source returns immutable records when
+ * the RecordReader creates new key and value objects every time it reads data.
+ */
+public class EmployeeInputFormat extends InputFormat<Text, Employee> {
+
+  public EmployeeInputFormat() {}
+
+  @Override
+  public RecordReader<Text, Employee> createRecordReader(InputSplit split,
+      TaskAttemptContext context) throws IOException, InterruptedException {
+    return new EmployeeRecordReader();
+  }
+
+  @Override
+  public List<InputSplit> getSplits(JobContext arg0) throws IOException, InterruptedException {
+    List<InputSplit> inputSplitList = new ArrayList<InputSplit>();
+    for (int i = 1; i <= TestEmployeeDataSet.NUMBER_OF_SPLITS; i++) {
+      InputSplit inputSplitObj =
+          new NewObjectsEmployeeInputSplit(
+              ((i - 1) * TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT), (i
+                  * TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT - 1));
+      inputSplitList.add(inputSplitObj);
+    }
+    return inputSplitList;
+  }
+
+  /**
+   * InputSplit implementation for EmployeeInputFormat.
+   */
+  public static class NewObjectsEmployeeInputSplit extends InputSplit implements Writable {
+    // Start and end map index of each split of employeeData.
+    private long startIndex;
+    private long endIndex;
+
+    public NewObjectsEmployeeInputSplit() {}
+
+    public NewObjectsEmployeeInputSplit(long startIndex, long endIndex) {
+      this.startIndex = startIndex;
+      this.endIndex = endIndex;
+    }
+
+    /**
+     * Returns the number of records in this split.
+     */
+    @Override
+    public long getLength() throws IOException, InterruptedException {
+      return this.endIndex - this.startIndex + 1;
+    }
+
+    @Override
+    public String[] getLocations() throws IOException, InterruptedException {
+      return null;
+    }
+
+    public long getStartIndex() {
+      return startIndex;
+    }
+
+    public long getEndIndex() {
+      return endIndex;
+    }
+
+    @Override
+    public void readFields(DataInput dataIn) throws IOException {
+      startIndex = dataIn.readLong();
+      endIndex = dataIn.readLong();
+    }
+
+    @Override
+    public void write(DataOutput dataOut) throws IOException {
+      dataOut.writeLong(startIndex);
+      dataOut.writeLong(endIndex);
+    }
+  }
+
+  /**
+   * RecordReader for EmployeeInputFormat.
+   */
+  public class EmployeeRecordReader extends RecordReader<Text, Employee> {
+
+    private NewObjectsEmployeeInputSplit split;
+    private Text currentKey;
+    private Employee currentValue;
+    private long employeeListIndex = 0L;
+    private long recordsRead = 0L;
+    private List<KV<String, String>> employeeDataList;
+
+    public EmployeeRecordReader() {}
+
+    @Override
+    public void close() throws IOException {}
+
+    @Override
+    public Text getCurrentKey() throws IOException, InterruptedException {
+      return currentKey;
+    }
+
+    @Override
+    public Employee getCurrentValue() throws IOException, InterruptedException {
+      return currentValue;
+    }
+
+    @Override
+    public float getProgress() throws IOException, InterruptedException {
+      return (float) recordsRead / split.getLength();
+    }
+
+    @Override
+    public void initialize(InputSplit split, TaskAttemptContext arg1) throws IOException,
+        InterruptedException {
+      this.split = (NewObjectsEmployeeInputSplit) split;
+      employeeListIndex = this.split.getStartIndex() - 1;
+      recordsRead = 0;
+      employeeDataList = TestEmployeeDataSet.populateEmployeeData();
+      currentValue = new Employee(null, null);
+    }
+
+    @Override
+    public boolean nextKeyValue() throws IOException, InterruptedException {
+      if ((recordsRead++) >= split.getLength()) {
+        return false;
+      }
+      employeeListIndex++;
+      KV<String, String> employeeDetails = employeeDataList.get((int) employeeListIndex);
+      String[] empData = employeeDetails.getValue().split("_");
+      /*
+       * New key and value objects must be returned every time in order to test the scenario
+       * discussed in the class javadoc.
+       */
+      currentKey = new Text(employeeDetails.getKey());
+      currentValue = new Employee(empData[0], empData[1]);
+      return true;
+    }
+  }
+}
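
For reference, the index arithmetic in getSplits() above yields contiguous, non-overlapping ranges.
Assuming, purely for illustration, NUMBER_OF_SPLITS = 3 and NUMBER_OF_RECORDS_IN_EACH_SPLIT = 5
(the real constants live in TestEmployeeDataSet, which is part of this commit but not shown here),
the loop produces:

    // i = 1 -> new NewObjectsEmployeeInputSplit(0, 4)
    // i = 2 -> new NewObjectsEmployeeInputSplit(5, 9)
    // i = 3 -> new NewObjectsEmployeeInputSplit(10, 14)
    // Each split reports getLength() == endIndex - startIndex + 1 == 5 records.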


[7/7] beam git commit: This closes #2193

Posted by da...@apache.org.
This closes #2193


Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/82694fe7
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/82694fe7
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/82694fe7

Branch: refs/heads/master
Commit: 82694fe72892a2284139f75a76a13b8e1b1ec1bf
Parents: 9c284d6 174436b
Author: Davor Bonaci <da...@google.com>
Authored: Thu Apr 6 16:32:58 2017 +0200
Committer: Davor Bonaci <da...@google.com>
Committed: Thu Apr 6 16:32:58 2017 +0200

----------------------------------------------------------------------
 pom.xml                                         |    6 +
 sdks/java/io/hadoop-input-format/README.md      |  167 ---
 sdks/java/io/hadoop-input-format/pom.xml        |  136 ---
 .../hadoop/inputformat/HadoopInputFormatIO.java |  941 ---------------
 .../sdk/io/hadoop/inputformat/package-info.java |   23 -
 .../ConfigurableEmployeeInputFormat.java        |  131 ---
 .../sdk/io/hadoop/inputformat/Employee.java     |   85 --
 .../hadoop/inputformat/EmployeeInputFormat.java |  172 ---
 .../inputformat/HadoopInputFormatIOTest.java    |  844 --------------
 .../ReuseObjectsEmployeeInputFormat.java        |  176 ---
 .../hadoop/inputformat/TestEmployeeDataSet.java |   76 --
 sdks/java/io/hadoop/README.md                   |  167 +++
 sdks/java/io/hadoop/input-format/pom.xml        |   98 ++
 .../hadoop/inputformat/HadoopInputFormatIO.java |  842 ++++++++++++++
 .../sdk/io/hadoop/inputformat/package-info.java |   23 +
 .../ConfigurableEmployeeInputFormat.java        |  131 +++
 .../sdk/io/hadoop/inputformat/Employee.java     |   85 ++
 .../hadoop/inputformat/EmployeeInputFormat.java |  172 +++
 .../inputformat/HadoopInputFormatIOTest.java    |  797 +++++++++++++
 .../ReuseObjectsEmployeeInputFormat.java        |  176 +++
 .../hadoop/inputformat/TestEmployeeDataSet.java |   76 ++
 sdks/java/io/hadoop/jdk1.8-tests/pom.xml        |  278 +++++
 .../inputformat/HIFIOWithElasticTest.java       |  277 +++++
 .../custom/options/HIFTestOptions.java          |   64 ++
 .../hadoop/inputformat/hashing/HashingFn.java   |  109 ++
 .../integration/tests/HIFIOCassandraIT.java     |  173 +++
 .../integration/tests/HIFIOElasticIT.java       |  215 ++++
 .../src/test/resources/cassandra.yaml           | 1074 ++++++++++++++++++
 .../SmallITCluster/cassandra-svc-rc.yaml        |   88 ++
 .../cassandra/SmallITCluster/start-up.sh        |   21 +
 .../cassandra/SmallITCluster/teardown.sh        |   21 +
 .../kubernetes/cassandra/data-load-setup.sh     |   29 +
 .../resources/kubernetes/cassandra/data-load.sh |   67 ++
 .../LargeProductionCluster/es-services.yaml     |  277 +++++
 .../LargeProductionCluster/start-up.sh          |   21 +
 .../LargeProductionCluster/teardown.sh          |   20 +
 .../SmallITCluster/elasticsearch-svc-rc.yaml    |   84 ++
 .../elasticsearch/SmallITCluster/start-up.sh    |   22 +
 .../elasticsearch/SmallITCluster/teardown.sh    |   20 +
 .../kubernetes/elasticsearch/data-load-setup.sh |   26 +
 .../kubernetes/elasticsearch/data-load.sh       |   33 +
 .../kubernetes/elasticsearch/es_test_data.py    |  299 +++++
 .../kubernetes/elasticsearch/show-health.sh     |   25 +
 sdks/java/io/hadoop/pom.xml                     |   53 +
 sdks/java/io/pom.xml                            |    2 +-
 45 files changed, 5870 insertions(+), 2752 deletions(-)
----------------------------------------------------------------------



[2/7] beam git commit: HadoopInputFormatIO with junits

Posted by da...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/integration/tests/HIFIOCassandraIT.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/integration/tests/HIFIOCassandraIT.java b/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/integration/tests/HIFIOCassandraIT.java
new file mode 100644
index 0000000..bf9a5fd
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/integration/tests/HIFIOCassandraIT.java
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.hadoop.inputformat.integration.tests;
+
+import com.datastax.driver.core.Row;
+
+import java.io.Serializable;
+
+import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO;
+import org.apache.beam.sdk.io.hadoop.inputformat.custom.options.HIFTestOptions;
+import org.apache.beam.sdk.io.hadoop.inputformat.hashing.HashingFn;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.testing.PAssert;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.beam.sdk.transforms.Count;
+import org.apache.beam.sdk.transforms.SimpleFunction;
+import org.apache.beam.sdk.transforms.Values;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * A test of {@link org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO} on an
+ * independent Cassandra instance.
+ *
+ * <p>This test requires a running instance of Cassandra, and the test dataset must exist in
+ * the database.
+ *
+ * <p>You can run this test by doing the following:
+ * <pre>
+ *  mvn -e -Pio-it verify -pl sdks/java/io/hadoop/jdk1.8-tests/HIFIOCassandraIT
+ *  -DintegrationTestPipelineOptions='[
+ *  "--cassandraServerIp=1.2.3.4",
+ *  "--cassandraServerPort=port",
+ *  "--cassandraUserName=user",
+ *  "--cassandraPassword=mypass" ]'
+ * </pre>
+ *
+ * <p>If you want to run this with a runner besides the DirectRunner, there are profiles for
+ * Dataflow and Spark in the jdk1.8-tests pom. Activate those in addition to the normal test runner
+ * invocation pipeline options.
+ */
+
+@RunWith(JUnit4.class)
+public class HIFIOCassandraIT implements Serializable {
+
+  private static final String CASSANDRA_KEYSPACE = "ycsb";
+  private static final String CASSANDRA_TABLE = "usertable";
+  private static final String CASSANDRA_THRIFT_PORT_PROPERTY = "cassandra.input.thrift.port";
+  private static final String CASSANDRA_THRIFT_ADDRESS_PROPERTY = "cassandra.input.thrift.address";
+  private static final String CASSANDRA_PARTITIONER_CLASS_PROPERTY =
+      "cassandra.input.partitioner.class";
+  private static final String CASSANDRA_KEYSPACE_PROPERTY = "cassandra.input.keyspace";
+  private static final String CASSANDRA_COLUMNFAMILY_PROPERTY = "cassandra.input.columnfamily";
+  private static final String CASSANDRA_PARTITIONER_CLASS_VALUE = "Murmur3Partitioner";
+  private static final String USERNAME = "cassandra.username";
+  private static final String PASSWORD = "cassandra.password";
+  private static final String INPUT_KEYSPACE_USERNAME_CONFIG = "cassandra.input.keyspace.username";
+  private static final String INPUT_KEYSPACE_PASSWD_CONFIG = "cassandra.input.keyspace.passwd";
+  private static HIFTestOptions options;
+  @Rule
+  public final transient TestPipeline pipeline = TestPipeline.create();
+
+  @BeforeClass
+  public static void setUp() {
+    PipelineOptionsFactory.register(HIFTestOptions.class);
+    options = TestPipeline.testingPipelineOptions().as(HIFTestOptions.class);
+  }
+
+  /**
+   * This test reads data from the Cassandra instance and verifies if data is read successfully.
+   */
+  @Test
+  public void testHIFReadForCassandra() {
+    // The expected hashcode was computed once at data insertion time and is hardcoded here.
+    String expectedHashCode = "5ea121d90d95c84076f7556605080f4b2c3081a7";
+    Long expectedRecordsCount = 1000L;
+    Configuration conf = getConfiguration(options);
+    PCollection<KV<Long, String>> cassandraData = pipeline.apply(HadoopInputFormatIO
+        .<Long, String>read().withConfiguration(conf).withValueTranslation(myValueTranslate));
+    PAssert.thatSingleton(cassandraData.apply("Count", Count.<KV<Long, String>>globally()))
+        .isEqualTo(expectedRecordsCount);
+    PCollection<String> textValues = cassandraData.apply(Values.<String>create());
+    // Verify the output values using checksum comparison.
+    PCollection<String> consolidatedHashcode =
+        textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
+    PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
+    pipeline.run().waitUntilFinish();
+  }
+
+  SimpleFunction<Row, String> myValueTranslate = new SimpleFunction<Row, String>() {
+    @Override
+    public String apply(Row input) {
+      return input.getString("y_id") + "|" + input.getString("field0") + "|"
+          + input.getString("field1") + "|" + input.getString("field2") + "|"
+          + input.getString("field3") + "|" + input.getString("field4") + "|"
+          + input.getString("field5") + "|" + input.getString("field6") + "|"
+          + input.getString("field7") + "|" + input.getString("field8") + "|"
+          + input.getString("field9");
+    }
+  };
+  /**
+   * This test reads data from the Cassandra instance based on a query and verifies whether the
+   * data is read successfully.
+   */
+  @Test
+  public void testHIFReadForCassandraQuery() {
+    String expectedHashCode = "a19593e4c72a67e26cb470130864daabf5a99d62";
+    Long expectedNumRows = 1L;
+    Configuration conf = getConfiguration(options);
+    conf.set("cassandra.input.cql", "select * from " + CASSANDRA_KEYSPACE + "." + CASSANDRA_TABLE
+        + " where token(y_id) > ? and token(y_id) <= ? "
+        + "and field0 = 'user48:field0:431531' allow filtering");
+    PCollection<KV<Long, String>> cassandraData =
+        pipeline.apply(HadoopInputFormatIO.<Long, String>read().withConfiguration(conf)
+            .withValueTranslation(myValueTranslate));
+    PAssert.thatSingleton(cassandraData.apply("Count", Count.<KV<Long, String>>globally()))
+        .isEqualTo(expectedNumRows);
+    PCollection<String> textValues = cassandraData.apply(Values.<String>create());
+    // Verify the output values using checksum comparison.
+    PCollection<String> consolidatedHashcode =
+        textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
+    PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
+    pipeline.run().waitUntilFinish();
+  }
+
+  /**
+   * Returns the Hadoop configuration for reading data from Cassandra. To read data from Cassandra
+   * using HadoopInputFormatIO, the following properties must be set: InputFormat class,
+   * InputFormat key class, InputFormat value class, Thrift address, Thrift port, partitioner
+   * class, keyspace and column family name.
+   */
+  private static Configuration getConfiguration(HIFTestOptions options) {
+    Configuration conf = new Configuration();
+    conf.set(CASSANDRA_THRIFT_PORT_PROPERTY, options.getCassandraServerPort().toString());
+    conf.set(CASSANDRA_THRIFT_ADDRESS_PROPERTY, options.getCassandraServerIp());
+    conf.set(CASSANDRA_PARTITIONER_CLASS_PROPERTY, CASSANDRA_PARTITIONER_CLASS_VALUE);
+    conf.set(CASSANDRA_KEYSPACE_PROPERTY, CASSANDRA_KEYSPACE);
+    conf.set(CASSANDRA_COLUMNFAMILY_PROPERTY, CASSANDRA_TABLE);
+    // Set user name and password if Cassandra instance has security configured.
+    conf.set(USERNAME, options.getCassandraUserName());
+    conf.set(PASSWORD, options.getCassandraPassword());
+    conf.set(INPUT_KEYSPACE_USERNAME_CONFIG, options.getCassandraUserName());
+    conf.set(INPUT_KEYSPACE_PASSWD_CONFIG, options.getCassandraPassword());
+    conf.setClass("mapreduce.job.inputformat.class",
+        org.apache.cassandra.hadoop.cql3.CqlInputFormat.class, InputFormat.class);
+    conf.setClass("key.class", java.lang.Long.class, Object.class);
+    conf.setClass("value.class", com.datastax.driver.core.Row.class, Object.class);
+    return conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/integration/tests/HIFIOElasticIT.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/integration/tests/HIFIOElasticIT.java b/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/integration/tests/HIFIOElasticIT.java
new file mode 100644
index 0000000..13c0cbc
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/integration/tests/HIFIOElasticIT.java
@@ -0,0 +1,215 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.beam.sdk.io.hadoop.inputformat.integration.tests;
+
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO;
+import org.apache.beam.sdk.io.hadoop.inputformat.custom.options.HIFTestOptions;
+import org.apache.beam.sdk.io.hadoop.inputformat.hashing.HashingFn;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.testing.PAssert;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.beam.sdk.transforms.Count;
+import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.SimpleFunction;
+import org.apache.beam.sdk.transforms.Values;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.elasticsearch.hadoop.cfg.ConfigurationOptions;
+import org.elasticsearch.hadoop.mr.LinkedMapWritable;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * A test of {@link org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO} on an
+ * independent Elasticsearch instance.
+ *
+ * <p>This test requires a running instance of Elasticsearch, and the test dataset must exist in
+ * the database.
+ *
+ * <p>You can run this test by doing the following:
+ * <pre>
+ *  mvn -e -Pio-it verify -pl sdks/java/io/hadoop/jdk1.8-tests/HIFIOElasticIT
+ *  -DintegrationTestPipelineOptions='[
+ *  "--elasticServerIp=1.2.3.4",
+ *  "--elasticServerPort=port",
+ *  "--elasticUserName=user",
+ *  "--elasticPassword=mypass" ]'
+ * </pre>
+ *
+ * <p>If you want to run this with a runner besides the DirectRunner, there are profiles for
+ * Dataflow and Spark in the jdk1.8-tests pom. Activate those in addition to the normal test runner
+ * invocation pipeline options.
+ */
+
+@RunWith(JUnit4.class)
+public class HIFIOElasticIT implements Serializable {
+
+  private static final String ELASTIC_INTERNAL_VERSION = "5.x";
+  private static final String TRUE = "true";
+  private static final String ELASTIC_INDEX_NAME = "test_data";
+  private static final String ELASTIC_TYPE_NAME = "test_type";
+  private static final String ELASTIC_RESOURCE = "/" + ELASTIC_INDEX_NAME + "/" + ELASTIC_TYPE_NAME;
+  private static HIFTestOptions options;
+  @Rule
+  public final transient TestPipeline pipeline = TestPipeline.create();
+
+  @BeforeClass
+  public static void setUp() {
+    PipelineOptionsFactory.register(HIFTestOptions.class);
+    options = TestPipeline.testingPipelineOptions().as(HIFTestOptions.class);
+  }
+
+  /**
+   * This test reads data from the Elasticsearch instance and verifies whether data is read
+   * successfully.
+   */
+  @Test
+  public void testHifIOWithElastic() throws SecurityException, IOException {
+    // The expected hashcode was computed once at data insertion time and is hardcoded here.
+    final long expectedRowCount = 1000L;
+    String expectedHashCode = "ed36c09b5e24a95fd8d3cc711a043a85320bb47d";
+    Configuration conf = getConfiguration(options);
+    PCollection<KV<Text, LinkedMapWritable>> esData =
+        pipeline.apply(HadoopInputFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
+    // Verify that the count of objects fetched using HIFInputFormat IO is correct.
+    PCollection<Long> count = esData.apply(Count.<KV<Text, LinkedMapWritable>>globally());
+    PAssert.thatSingleton(count).isEqualTo(expectedRowCount);
+    PCollection<LinkedMapWritable> values = esData.apply(Values.<LinkedMapWritable>create());
+    PCollection<String> textValues = values.apply(transformFunc);
+    // Verify the output values using checksum comparison.
+    PCollection<String> consolidatedHashcode =
+        textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
+    PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
+    pipeline.run().waitUntilFinish();
+  }
+
+  MapElements<LinkedMapWritable, String> transformFunc =
+      MapElements.<LinkedMapWritable, String>via(new SimpleFunction<LinkedMapWritable, String>() {
+        @Override
+        public String apply(LinkedMapWritable mapw) {
+          String rowValue = "";
+          rowValue = convertMapWRowToString(mapw);
+          return rowValue;
+        }
+      });
+  /*
+   * Builds a string representation of a MapWritable row by concatenating all of its field values,
+   * separated by '|'.
+   */
+  private String convertMapWRowToString(LinkedMapWritable mapw) {
+    String rowValue = "";
+    rowValue = addFieldValuesToRow(rowValue, mapw, "User_Name");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "Item_Code");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "Txn_ID");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "Item_ID");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "last_updated");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "Price");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "Title");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "Description");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "Age");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "Item_Name");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "Item_Price");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "Availability");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "Batch_Num");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "Last_Ordered");
+    rowValue = addFieldValuesToRow(rowValue, mapw, "City");
+    return rowValue;
+  }
+
+  /*
+   * Convert a MapWritable row field into a string, and append it to the row string with a
+   * separator.
+   */
+  private String addFieldValuesToRow(String row, MapWritable mapw, String columnName) {
+    Object valueObj = mapw.get(new Text(columnName));
+    row += valueObj.toString() + "|";
+    return row;
+  }
+
+  /**
+   * This test reads data from the Elasticsearch instance based on a query and verifies whether
+   * data is read successfully.
+   */
+  @Test
+  public void testHifIOWithElasticQuery() {
+    String expectedHashCode = "83c108ff81e87b6f3807c638e6bb9a9e3d430dc7";
+    Long expectedRecordsCount = 1L;
+    Configuration conf = getConfiguration(options);
+    String query = "{"
+                  + "  \"query\": {"
+                  + "  \"match\" : {"
+                  + "    \"Title\" : {"
+                  + "      \"query\" : \"Title9\","
+                  + "      \"type\" : \"boolean\""
+                  + "    }"
+                  + "  }"
+                  + "  }"
+                  + "}";
+    conf.set(ConfigurationOptions.ES_QUERY, query);
+    PCollection<KV<Text, LinkedMapWritable>> esData =
+        pipeline.apply(HadoopInputFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
+    PCollection<Long> count = esData.apply(Count.<KV<Text, LinkedMapWritable>>globally());
+    // Verify that the count of objects fetched using HadoopInputFormatIO is correct.
+    PAssert.thatSingleton(count).isEqualTo(expectedRecordsCount);
+    PCollection<LinkedMapWritable> values = esData.apply(Values.<LinkedMapWritable>create());
+    PCollection<String> textValues = values.apply(transformFunc);
+    // Verify the output values using checksum comparison.
+    PCollection<String> consolidatedHashcode =
+        textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
+    PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
+    pipeline.run().waitUntilFinish();
+  }
+
+  /**
+   * Returns the Hadoop configuration for reading data from Elasticsearch. The Configuration object
+   * must have the InputFormat class, key class and value class set. Mandatory fields for
+   * EsInputFormat are es.resource, es.nodes, es.port, es.internal.es.version and
+   * es.nodes.wan.only. Please refer to
+   * <a href="https://www.elastic.co/guide/en/elasticsearch/hadoop/current/configuration.html"
+   * >Elasticsearch Configuration</a> for more details.
+   */
+  private static Configuration getConfiguration(HIFTestOptions options) {
+    Configuration conf = new Configuration();
+    conf.set(ConfigurationOptions.ES_NODES, options.getElasticServerIp());
+    conf.set(ConfigurationOptions.ES_PORT, options.getElasticServerPort().toString());
+    conf.set(ConfigurationOptions.ES_NODES_WAN_ONLY, TRUE);
+    // Set username and password if Elasticsearch is configured with security.
+    conf.set(ConfigurationOptions.ES_NET_HTTP_AUTH_USER, options.getElasticUserName());
+    conf.set(ConfigurationOptions.ES_NET_HTTP_AUTH_PASS, options.getElasticPassword());
+    conf.set(ConfigurationOptions.ES_RESOURCE, ELASTIC_RESOURCE);
+    conf.set("es.internal.es.version", ELASTIC_INTERNAL_VERSION);
+    conf.set(ConfigurationOptions.ES_INDEX_AUTO_CREATE, TRUE);
+    conf.setClass("mapreduce.job.inputformat.class",
+        org.elasticsearch.hadoop.mr.EsInputFormat.class, InputFormat.class);
+    conf.setClass("key.class", Text.class, Object.class);
+    conf.setClass("value.class", LinkedMapWritable.class, Object.class);
+    // Optimizations: tune the max docs per partition, scroll size and batch size (in bytes) to
+    // reduce the test time for large data sets.
+    conf.set("es.input.max.docs.per.partition", "50000");
+    conf.set("es.scroll.size", "400");
+    conf.set("es.batch.size.bytes", "8mb");
+    return conf;
+  }
+}
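
For reference, the configuration built in getConfiguration() maps directly onto a plain (non-test) read pipeline. The following is a minimal sketch, assuming an Elasticsearch node at localhost:9200, an index/type resource of /test_data/test_type and a 5.x cluster; the import path of HadoopInputFormatIO is assumed to follow this module's package layout.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.elasticsearch.hadoop.mr.EsInputFormat;
import org.elasticsearch.hadoop.mr.LinkedMapWritable;

public class EsHifReadSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    conf.set("es.nodes", "localhost");               // assumed Elasticsearch host
    conf.set("es.port", "9200");                     // assumed REST port
    conf.set("es.resource", "/test_data/test_type"); // assumed index/type
    conf.set("es.nodes.wan.only", "true");
    conf.set("es.internal.es.version", "5.x");       // should match the target cluster version
    conf.setClass("mapreduce.job.inputformat.class", EsInputFormat.class, InputFormat.class);
    conf.setClass("key.class", Text.class, Object.class);
    conf.setClass("value.class", LinkedMapWritable.class, Object.class);

    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    PCollection<KV<Text, LinkedMapWritable>> rows =
        p.apply(HadoopInputFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
    p.run().waitUntilFinish();
  }
}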

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/cassandra.yaml
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/cassandra.yaml b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/cassandra.yaml
new file mode 100644
index 0000000..ca1e48f
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/cassandra.yaml
@@ -0,0 +1,1074 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Cassandra storage config YAML required for Embedded Cassandra server test
+
+# NOTE:
+#   See http://wiki.apache.org/cassandra/StorageConfiguration for
+#   full explanations of configuration directives
+# /NOTE
+
+# The name of the cluster. This is mainly used to prevent machines in
+# one logical cluster from joining another.
+cluster_name: 'beam'
+
+# This defines the number of tokens randomly assigned to this node on the ring
+# The more tokens, relative to other nodes, the larger the proportion of data
+# that this node will store. You probably want all nodes to have the same number
+# of tokens assuming they have equal hardware capability.
+#
+# If you leave this unspecified, Cassandra will use the default of 1 token for legacy compatibility,
+# and will use the initial_token as described below.
+#
+# Specifying initial_token will override this setting on the node's initial start;
+# on subsequent starts, this setting will apply even if initial token is set.
+#
+# If you already have a cluster with 1 token per node, and wish to migrate to 
+# multiple tokens per node, see http://wiki.apache.org/cassandra/Operations
+num_tokens: 1
+
+# Triggers automatic allocation of num_tokens tokens for this node. The allocation
+# algorithm attempts to choose tokens in a way that optimizes replicated load over
+# the nodes in the datacenter for the replication strategy used by the specified
+# keyspace.
+#
+# The load assigned to each node will be close to proportional to its number of
+# vnodes.
+#
+# Only supported with the Murmur3Partitioner.
+# allocate_tokens_for_keyspace: KEYSPACE
+
+# initial_token allows you to specify tokens manually.  While you can use it with
+# vnodes (num_tokens > 1, above) -- in which case you should provide a
+# comma-separated list -- it's primarily used when adding nodes to legacy clusters
+# that do not have vnodes enabled.
+# initial_token:
+
+# See http://wiki.apache.org/cassandra/HintedHandoff
+# May either be "true" or "false" to enable globally
+hinted_handoff_enabled: true
+# When hinted_handoff_enabled is true, a black list of data centers that will not
+# perform hinted handoff
+#hinted_handoff_disabled_datacenters:
+#    - DC1
+#    - DC2
+# this defines the maximum amount of time a dead host will have hints
+# generated.  After it has been dead this long, new hints for it will not be
+# created until it has been seen alive and gone down again.
+max_hint_window_in_ms: 10800000 # 3 hours
+
+# Maximum throttle in KBs per second, per delivery thread.  This will be
+# reduced proportionally to the number of nodes in the cluster.  (If there
+# are two nodes in the cluster, each delivery thread will use the maximum
+# rate; if there are three, each will throttle to half of the maximum,
+# since we expect two nodes to be delivering hints simultaneously.)
+hinted_handoff_throttle_in_kb: 1024
+
+# Number of threads with which to deliver hints;
+# Consider increasing this number when you have multi-dc deployments, since
+# cross-dc handoff tends to be slower
+max_hints_delivery_threads: 2
+
+# Directory where Cassandra should store hints.
+# If not set, the default directory is $CASSANDRA_HOME/data/hints.
+# hints_directory: /var/lib/cassandra/hints
+hints_directory: target/cassandra/hints
+
+# How often hints should be flushed from the internal buffers to disk.
+# Will *not* trigger fsync.
+hints_flush_period_in_ms: 10000
+
+# Maximum size for a single hints file, in megabytes.
+max_hints_file_size_in_mb: 128
+
+# Compression to apply to the hint files. If omitted, hints files
+# will be written uncompressed. LZ4, Snappy, and Deflate compressors
+# are supported.
+#hints_compression:
+#   - class_name: LZ4Compressor
+#     parameters:
+#         -
+
+# Maximum throttle in KBs per second, total. This will be
+# reduced proportionally to the number of nodes in the cluster.
+batchlog_replay_throttle_in_kb: 1024
+
+# Authentication backend, implementing IAuthenticator; used to identify users
+# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthenticator,
+# PasswordAuthenticator}.
+#
+# - AllowAllAuthenticator performs no checks - set it to disable authentication.
+# - PasswordAuthenticator relies on username/password pairs to authenticate
+#   users. It keeps usernames and hashed passwords in system_auth.credentials table.
+#   Please increase system_auth keyspace replication factor if you use this authenticator.
+#   If using PasswordAuthenticator, CassandraRoleManager must also be used (see below)
+authenticator: AllowAllAuthenticator
+
+# Authorization backend, implementing IAuthorizer; used to limit access/provide permissions
+# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthorizer,
+# CassandraAuthorizer}.
+#
+# - AllowAllAuthorizer allows any action to any user - set it to disable authorization.
+# - CassandraAuthorizer stores permissions in system_auth.permissions table. Please
+#   increase system_auth keyspace replication factor if you use this authorizer.
+authorizer: AllowAllAuthorizer
+
+# Part of the Authentication & Authorization backend, implementing IRoleManager; used
+# to maintain grants and memberships between roles.
+# Out of the box, Cassandra provides org.apache.cassandra.auth.CassandraRoleManager,
+# which stores role information in the system_auth keyspace. Most functions of the
+# IRoleManager require an authenticated login, so unless the configured IAuthenticator
+# actually implements authentication, most of this functionality will be unavailable.
+#
+# - CassandraRoleManager stores role data in the system_auth keyspace. Please
+#   increase system_auth keyspace replication factor if you use this role manager.
+role_manager: CassandraRoleManager
+
+# Validity period for roles cache (fetching granted roles can be an expensive
+# operation depending on the role manager, CassandraRoleManager is one example)
+# Granted roles are cached for authenticated sessions in AuthenticatedUser and
+# after the period specified here, become eligible for (async) reload.
+# Defaults to 2000, set to 0 to disable caching entirely.
+# Will be disabled automatically for AllowAllAuthenticator.
+roles_validity_in_ms: 2000
+
+# Refresh interval for roles cache (if enabled).
+# After this interval, cache entries become eligible for refresh. Upon next
+# access, an async reload is scheduled and the old value returned until it
+# completes. If roles_validity_in_ms is non-zero, then this must be
+# also.
+# Defaults to the same value as roles_validity_in_ms.
+# roles_update_interval_in_ms: 2000
+
+# Validity period for permissions cache (fetching permissions can be an
+# expensive operation depending on the authorizer, CassandraAuthorizer is
+# one example). Defaults to 2000, set to 0 to disable.
+# Will be disabled automatically for AllowAllAuthorizer.
+permissions_validity_in_ms: 2000
+
+# Refresh interval for permissions cache (if enabled).
+# After this interval, cache entries become eligible for refresh. Upon next
+# access, an async reload is scheduled and the old value returned until it
+# completes. If permissions_validity_in_ms is non-zero, then this must be
+# also.
+# Defaults to the same value as permissions_validity_in_ms.
+# permissions_update_interval_in_ms: 2000
+
+# Validity period for credentials cache. This cache is tightly coupled to
+# the provided PasswordAuthenticator implementation of IAuthenticator. If
+# another IAuthenticator implementation is configured, this cache will not
+# be automatically used and so the following settings will have no effect.
+# Please note, credentials are cached in their encrypted form, so while
+# activating this cache may reduce the number of queries made to the
+# underlying table, it may not  bring a significant reduction in the
+# latency of individual authentication attempts.
+# Defaults to 2000, set to 0 to disable credentials caching.
+credentials_validity_in_ms: 2000
+
+# Refresh interval for credentials cache (if enabled).
+# After this interval, cache entries become eligible for refresh. Upon next
+# access, an async reload is scheduled and the old value returned until it
+# completes. If credentials_validity_in_ms is non-zero, then this must be
+# also.
+# Defaults to the same value as credentials_validity_in_ms.
+# credentials_update_interval_in_ms: 2000
+
+# The partitioner is responsible for distributing groups of rows (by
+# partition key) across nodes in the cluster.  You should leave this
+# alone for new clusters.  The partitioner can NOT be changed without
+# reloading all data, so when upgrading you should set this to the
+# same partitioner you were already using.
+#
+# Besides Murmur3Partitioner, partitioners included for backwards
+# compatibility include RandomPartitioner, ByteOrderedPartitioner, and
+# OrderPreservingPartitioner.
+#
+partitioner: org.apache.cassandra.dht.Murmur3Partitioner
+
+# Directories where Cassandra should store data on disk.  Cassandra
+# will spread data evenly across them, subject to the granularity of
+# the configured compaction strategy.
+# If not set, the default directory is $CASSANDRA_HOME/data/data.
+# data_file_directories:
+#     - /var/lib/cassandra/data
+data_file_directories:
+      - target/cassandra/data
+
+# commit log.  when running on magnetic HDD, this should be a
+# separate spindle than the data directories.
+# If not set, the default directory is $CASSANDRA_HOME/data/commitlog.
+# commitlog_directory: /var/lib/cassandra/commitlog
+commitlog_directory: target/cassandra/commitlog
+cdc_raw_directory: target/cassandra/cdc_raw
+# policy for data disk failures:
+# die: shut down gossip and client transports and kill the JVM for any fs errors or
+#      single-sstable errors, so the node can be replaced.
+# stop_paranoid: shut down gossip and client transports even for single-sstable errors,
+#                kill the JVM for errors during startup.
+# stop: shut down gossip and client transports, leaving the node effectively dead, but
+#       can still be inspected via JMX, kill the JVM for errors during startup.
+# best_effort: stop using the failed disk and respond to requests based on
+#              remaining available sstables.  This means you WILL see obsolete
+#              data at CL.ONE!
+# ignore: ignore fatal errors and let requests fail, as in pre-1.2 Cassandra
+disk_failure_policy: stop
+
+# policy for commit disk failures:
+# die: shut down gossip and Thrift and kill the JVM, so the node can be replaced.
+# stop: shut down gossip and Thrift, leaving the node effectively dead, but
+#       can still be inspected via JMX.
+# stop_commit: shutdown the commit log, letting writes collect but
+#              continuing to service reads, as in pre-2.0.5 Cassandra
+# ignore: ignore fatal errors and let the batches fail
+commit_failure_policy: stop
+
+# Maximum size of the native protocol prepared statement cache
+#
+# Valid values are either "auto" (omitting the value) or a value greater than 0.
+#
+# Note that specifying too large a value will result in long-running GCs and possibly
+# out-of-memory errors. Keep the value at a small fraction of the heap.
+#
+# If you constantly see "prepared statements discarded in the last minute because
+# cache limit reached" messages, the first step is to investigate the root cause
+# of these messages and check whether prepared statements are used correctly -
+# i.e. use bind markers for variable parts.
+#
+# Only change the default value if you really have more prepared statements than
+# fit in the cache. In most cases it is not necessary to change this value.
+# Constantly re-preparing statements is a performance penalty.
+#
+# Default value ("auto") is 1/256th of the heap or 10MB, whichever is greater
+prepared_statements_cache_size_mb:
+
+# Maximum size of the Thrift prepared statement cache
+#
+# If you do not use Thrift at all, it is safe to leave this value at "auto".
+#
+# See description of 'prepared_statements_cache_size_mb' above for more information.
+#
+# Default value ("auto") is 1/256th of the heap or 10MB, whichever is greater
+thrift_prepared_statements_cache_size_mb:
+
+# Maximum size of the key cache in memory.
+#
+# Each key cache hit saves 1 seek and each row cache hit saves 2 seeks at the
+# minimum, sometimes more. The key cache is fairly tiny for the amount of
+# time it saves, so it's worthwhile to use it at large numbers.
+# The row cache saves even more time, but must contain the entire row,
+# so it is extremely space-intensive. It's best to only use the
+# row cache if you have hot rows or static rows.
+#
+# NOTE: if you reduce the size, you may not get your hottest keys loaded on startup.
+#
+# Default value is empty to make it "auto" (min(5% of Heap (in MB), 100MB)). Set to 0 to disable key cache.
+key_cache_size_in_mb:
+
+# Duration in seconds after which Cassandra should
+# save the key cache. Caches are saved to saved_caches_directory as
+# specified in this configuration file.
+#
+# Saved caches greatly improve cold-start speeds, and is relatively cheap in
+# terms of I/O for the key cache. Row cache saving is much more expensive and
+# has limited use.
+#
+# Default is 14400 or 4 hours.
+key_cache_save_period: 14400
+
+# Number of keys from the key cache to save
+# Disabled by default, meaning all keys are going to be saved
+# key_cache_keys_to_save: 100
+
+# Row cache implementation class name.
+# Available implementations:
+#   org.apache.cassandra.cache.OHCProvider                Fully off-heap row cache implementation (default).
+#   org.apache.cassandra.cache.SerializingCacheProvider   This is the row cache implementation available
+#                                                         in previous releases of Cassandra.
+# row_cache_class_name: org.apache.cassandra.cache.OHCProvider
+
+# Maximum size of the row cache in memory.
+# Please note that OHC cache implementation requires some additional off-heap memory to manage
+# the map structures and some in-flight memory during operations before/after cache entries can be
+# accounted against the cache capacity. This overhead is usually small compared to the whole capacity.
+# Do not specify more memory than the system can afford in the worst usual situation, and leave
+# some headroom for the OS block level cache. Never allow your system to swap.
+#
+# Default value is 0, to disable row caching.
+row_cache_size_in_mb: 0
+
+# Duration in seconds after which Cassandra should save the row cache.
+# Caches are saved to saved_caches_directory as specified in this configuration file.
+#
+# Saved caches greatly improve cold-start speeds, and is relatively cheap in
+# terms of I/O for the key cache. Row cache saving is much more expensive and
+# has limited use.
+#
+# Default is 0 to disable saving the row cache.
+row_cache_save_period: 0
+
+# Number of keys from the row cache to save.
+# Specify 0 (which is the default), meaning all keys are going to be saved
+# row_cache_keys_to_save: 100
+
+# Maximum size of the counter cache in memory.
+#
+# Counter cache helps to reduce counter locks' contention for hot counter cells.
+# In case of RF = 1 a counter cache hit will cause Cassandra to skip the read before
+# write entirely. With RF > 1 a counter cache hit will still help to reduce the duration
+# of the lock hold, helping with hot counter cell updates, but will not allow skipping
+# the read entirely. Only the local (clock, count) tuple of a counter cell is kept
+# in memory, not the whole counter, so it's relatively cheap.
+#
+# NOTE: if you reduce the size, you may not get your hottest keys loaded on startup.
+#
+# Default value is empty to make it "auto" (min(2.5% of Heap (in MB), 50MB)). Set to 0 to disable counter cache.
+# NOTE: if you perform counter deletes and rely on low gcgs, you should disable the counter cache.
+counter_cache_size_in_mb:
+
+# Duration in seconds after which Cassandra should
+# save the counter cache (keys only). Caches are saved to saved_caches_directory as
+# specified in this configuration file.
+#
+# Default is 7200 or 2 hours.
+counter_cache_save_period: 7200
+
+# Number of keys from the counter cache to save
+# Disabled by default, meaning all keys are going to be saved
+# counter_cache_keys_to_save: 100
+
+# saved caches
+# If not set, the default directory is $CASSANDRA_HOME/data/saved_caches.
+# saved_caches_directory: /var/lib/cassandra/saved_caches
+saved_caches_directory: target/cassandra/saved_caches
+
+# commitlog_sync may be either "periodic" or "batch." 
+# 
+# When in batch mode, Cassandra won't ack writes until the commit log
+# has been fsynced to disk.  It will wait
+# commitlog_sync_batch_window_in_ms milliseconds between fsyncs.
+# This window should be kept short because the writer threads will
+# be unable to do extra work while waiting.  (You may need to increase
+# concurrent_writes for the same reason.)
+#
+# commitlog_sync: batch
+# commitlog_sync_batch_window_in_ms: 2
+#
+# the other option is "periodic" where writes may be acked immediately
+# and the CommitLog is simply synced every commitlog_sync_period_in_ms
+# milliseconds. 
+commitlog_sync: periodic
+commitlog_sync_period_in_ms: 1
+
+# The size of the individual commitlog file segments.  A commitlog
+# segment may be archived, deleted, or recycled once all the data
+# in it (potentially from each columnfamily in the system) has been
+# flushed to sstables.
+#
+# The default size is 32, which is almost always fine, but if you are
+# archiving commitlog segments (see commitlog_archiving.properties),
+# then you probably want a finer granularity of archiving; 8 or 16 MB
+# is reasonable.
+# Max mutation size is also configurable via max_mutation_size_in_kb setting in
+# cassandra.yaml. The default is half the size of commitlog_segment_size_in_mb * 1024.
+#
+# NOTE: If max_mutation_size_in_kb is set explicitly then commitlog_segment_size_in_mb must
+# be set to at least twice the size of max_mutation_size_in_kb / 1024
+#
+commitlog_segment_size_in_mb: 32
+
+# Compression to apply to the commit log. If omitted, the commit log
+# will be written uncompressed.  LZ4, Snappy, and Deflate compressors
+# are supported.
+#commitlog_compression:
+#   - class_name: LZ4Compressor
+#     parameters:
+#         -
+
+# any class that implements the SeedProvider interface and has a
+# constructor that takes a Map<String, String> of parameters will do.
+seed_provider:
+    # Addresses of hosts that are deemed contact points. 
+    # Cassandra nodes use this list of hosts to find each other and learn
+    # the topology of the ring.  You must change this if you are running
+    # multiple nodes!
+    - class_name: org.apache.cassandra.locator.SimpleSeedProvider
+      parameters:
+          # seeds is actually a comma-delimited list of addresses.
+          # Ex: "<ip1>,<ip2>,<ip3>"
+          - seeds: "127.0.0.1"
+
+# For workloads with more data than can fit in memory, Cassandra's
+# bottleneck will be reads that need to fetch data from
+# disk. "concurrent_reads" should be set to (16 * number_of_drives) in
+# order to allow the operations to enqueue low enough in the stack
+# that the OS and drives can reorder them. Same applies to
+# "concurrent_counter_writes", since counter writes read the current
+# values before incrementing and writing them back.
+#
+# On the other hand, since writes are almost never IO bound, the ideal
+# number of "concurrent_writes" is dependent on the number of cores in
+# your system; (8 * number_of_cores) is a good rule of thumb.
+concurrent_reads: 32
+concurrent_writes: 32
+concurrent_counter_writes: 32
+
+# For materialized view writes, as there is a read involved, this should
+# be limited by the lesser of concurrent reads or concurrent writes.
+concurrent_materialized_view_writes: 32
+
+# Maximum memory to use for sstable chunk cache and buffer pooling.
+# 32MB of this are reserved for pooling buffers, the rest is used as a
+# cache that holds uncompressed sstable chunks.
+# Defaults to the smaller of 1/4 of heap or 512MB. This pool is allocated off-heap,
+# so is in addition to the memory allocated for heap. The cache also has on-heap
+# overhead which is roughly 128 bytes per chunk (i.e. 0.2% of the reserved size
+# if the default 64k chunk size is used).
+# Memory is only allocated when needed.
+# file_cache_size_in_mb: 512
+
+# Flag indicating whether to allocate on or off heap when the sstable buffer
+# pool is exhausted, that is when it has exceeded the maximum memory
+# file_cache_size_in_mb, beyond which it will not cache buffers but allocate on request.
+
+# buffer_pool_use_heap_if_exhausted: true
+
+# The strategy for optimizing disk read
+# Possible values are:
+# ssd (for solid state disks, the default)
+# spinning (for spinning disks)
+# disk_optimization_strategy: ssd
+
+# Total permitted memory to use for memtables. Cassandra will stop
+# accepting writes when the limit is exceeded until a flush completes,
+# and will trigger a flush based on memtable_cleanup_threshold
+# If omitted, Cassandra will set both to 1/4 the size of the heap.
+# memtable_heap_space_in_mb: 2048
+# memtable_offheap_space_in_mb: 2048
+
+# Ratio of occupied non-flushing memtable size to total permitted size
+# that will trigger a flush of the largest memtable. Larger mct will
+# mean larger flushes and hence less compaction, but also less concurrent
+# flush activity which can make it difficult to keep your disks fed
+# under heavy write load.
+#
+# memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1)
+# memtable_cleanup_threshold: 0.11
+
+# Specify the way Cassandra allocates and manages memtable memory.
+# Options are:
+#   heap_buffers:    on heap nio buffers
+#   offheap_buffers: off heap (direct) nio buffers
+#   offheap_objects: off heap objects
+memtable_allocation_type: heap_buffers
+
+# Total space to use for commit logs on disk.
+#
+# If space gets above this value, Cassandra will flush every dirty CF
+# in the oldest segment and remove it.  So a small total commitlog space
+# will tend to cause more flush activity on less-active columnfamilies.
+#
+# The default value is the smaller of 8192, and 1/4 of the total space
+# of the commitlog volume.
+#
+# commitlog_total_space_in_mb: 8192
+
+# This sets the amount of memtable flush writer threads.  These will
+# be blocked by disk io, and each one will hold a memtable in memory
+# while blocked.
+#
+# memtable_flush_writers defaults to one per data_file_directory.
+#
+# If your data directories are backed by SSD, you can increase this, but
+# avoid having memtable_flush_writers * data_file_directories > number of cores
+#memtable_flush_writers: 1
+
+# A fixed memory pool size in MB for SSTable index summaries. If left
+# empty, this will default to 5% of the heap size. If the memory usage of
+# all index summaries exceeds this limit, SSTables with low read rates will
+# shrink their index summaries in order to meet this limit.  However, this
+# is a best-effort process. In extreme conditions Cassandra may need to use
+# more than this amount of memory.
+index_summary_capacity_in_mb:
+
+# How frequently index summaries should be resampled.  This is done
+# periodically to redistribute memory from the fixed-size pool to sstables
+# proportional to their recent read rates.  Setting to -1 will disable this
+# process, leaving existing index summaries at their current sampling level.
+index_summary_resize_interval_in_minutes: 60
+
+# Whether to, when doing sequential writing, fsync() at intervals in
+# order to force the operating system to flush the dirty
+# buffers. Enable this to avoid sudden dirty buffer flushing from
+# impacting read latencies. Almost always a good idea on SSDs; not
+# necessarily on platters.
+trickle_fsync: false
+trickle_fsync_interval_in_kb: 10240
+
+# TCP port, for commands and data
+# For security reasons, you should not expose this port to the internet.  Firewall it if needed.
+storage_port: 7000
+
+# SSL port, for encrypted communication.  Unused unless enabled in
+# encryption_options
+# For security reasons, you should not expose this port to the internet.  Firewall it if needed.
+ssl_storage_port: 7001
+
+# Address or interface to bind to and tell other Cassandra nodes to connect to.
+# You _must_ change this if you want multiple nodes to be able to communicate!
+#
+# Set listen_address OR listen_interface, not both. Interfaces must correspond
+# to a single address, IP aliasing is not supported.
+#
+# Leaving it blank leaves it up to InetAddress.getLocalHost(). This
+# will always do the Right Thing _if_ the node is properly configured
+# (hostname, name resolution, etc), and the Right Thing is to use the
+# address associated with the hostname (it might not be).
+#
+# Setting listen_address to 0.0.0.0 is always wrong.
+#
+# If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address
+# you can specify which should be chosen using listen_interface_prefer_ipv6. If false the first ipv4
+# address will be used. If true the first ipv6 address will be used. Defaults to false preferring
+# ipv4. If there is only one address it will be selected regardless of ipv4/ipv6.
+listen_address: localhost
+# listen_interface: eth0
+# listen_interface_prefer_ipv6: false
+
+# Address to broadcast to other Cassandra nodes
+# Leaving this blank will set it to the same value as listen_address
+# broadcast_address: 1.2.3.4
+
+# When using multiple physical network interfaces, set this
+# to true to listen on broadcast_address in addition to
+# the listen_address, allowing nodes to communicate in both
+# interfaces.
+# Ignore this property if the network configuration automatically
+# routes  between the public and private networks such as EC2.
+# listen_on_broadcast_address: false
+
+# Internode authentication backend, implementing IInternodeAuthenticator;
+# used to allow/disallow connections from peer nodes.
+# internode_authenticator: org.apache.cassandra.auth.AllowAllInternodeAuthenticator
+
+# Whether to start the native transport server.
+# Please note that the address on which the native transport is bound is the
+# same as the rpc_address. The port however is different and specified below.
+start_native_transport: true
+# port for the CQL native transport to listen for clients on
+# For security reasons, you should not expose this port to the internet.  Firewall it if needed.
+native_transport_port: 9042
+# Enabling native transport encryption in client_encryption_options allows you to either use
+# encryption for the standard port or to use a dedicated, additional port along with the unencrypted
+# standard native_transport_port.
+# Enabling client encryption and keeping native_transport_port_ssl disabled will use encryption
+# for native_transport_port. Setting native_transport_port_ssl to a different value
+# from native_transport_port will use encryption for native_transport_port_ssl while
+# keeping native_transport_port unencrypted.
+# native_transport_port_ssl: 9142
+# The maximum threads for handling requests when the native transport is used.
+# This is similar to rpc_max_threads though the default differs slightly (and
+# there is no native_transport_min_threads, idle threads will always be stopped
+# after 30 seconds).
+# native_transport_max_threads: 128
+#
+# The maximum size of allowed frame. Frame (requests) larger than this will
+# be rejected as invalid. The default is 256MB. If you're changing this parameter,
+# you may want to adjust max_value_size_in_mb accordingly.
+# native_transport_max_frame_size_in_mb: 256
+
+# The maximum number of concurrent client connections.
+# The default is -1, which means unlimited.
+# native_transport_max_concurrent_connections: -1
+
+# The maximum number of concurrent client connections per source ip.
+# The default is -1, which means unlimited.
+# native_transport_max_concurrent_connections_per_ip: -1
+
+# Whether to start the thrift rpc server.
+start_rpc: true
+
+# The address or interface to bind the Thrift RPC service and native transport
+# server to.
+#
+# Set rpc_address OR rpc_interface, not both. Interfaces must correspond
+# to a single address, IP aliasing is not supported.
+#
+# Leaving rpc_address blank has the same effect as on listen_address
+# (i.e. it will be based on the configured hostname of the node).
+#
+# Note that unlike listen_address, you can specify 0.0.0.0, but you must also
+# set broadcast_rpc_address to a value other than 0.0.0.0.
+#
+# For security reasons, you should not expose this port to the internet.  Firewall it if needed.
+#
+# If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address
+# you can specify which should be chosen using rpc_interface_prefer_ipv6. If false the first ipv4
+# address will be used. If true the first ipv6 address will be used. Defaults to false preferring
+# ipv4. If there is only one address it will be selected regardless of ipv4/ipv6.
+rpc_address: 127.0.0.1
+# rpc_interface: eth1
+# rpc_interface_prefer_ipv6: false
+
+# port for Thrift to listen for clients on
+rpc_port: 9160
+
+# RPC address to broadcast to drivers and other Cassandra nodes. This cannot
+# be set to 0.0.0.0. If left blank, this will be set to the value of
+# rpc_address. If rpc_address is set to 0.0.0.0, broadcast_rpc_address must
+# be set.
+# broadcast_rpc_address: 1.2.3.4
+
+# enable or disable keepalive on rpc/native connections
+rpc_keepalive: true
+
+# Cassandra provides two out-of-the-box options for the RPC Server:
+#
+# sync  -> One thread per thrift connection. For a very large number of clients, memory
+#          will be your limiting factor. On a 64 bit JVM, 180KB is the minimum stack size
+#          per thread, and that will correspond to your use of virtual memory (but physical memory
+#          may be limited depending on use of stack space).
+#
+# hsha  -> Stands for "half synchronous, half asynchronous." All thrift clients are handled
+#          asynchronously using a small number of threads that does not vary with the amount
+#          of thrift clients (and thus scales well to many clients). The rpc requests are still
+#          synchronous (one thread per active request). If hsha is selected then it is essential
+#          that rpc_max_threads is changed from the default value of unlimited.
+#
+# The default is sync because on Windows hsha is about 30% slower.  On Linux,
+# sync/hsha performance is about the same, with hsha of course using less memory.
+#
+# Alternatively, you can provide your own RPC server by providing the fully-qualified class name
+# of an o.a.c.t.TServerFactory that can create an instance of it.
+rpc_server_type: sync
+
+# Uncomment rpc_min|max_thread to set request pool size limits.
+#
+# Regardless of your choice of RPC server (see above), the number of maximum requests in the
+# RPC thread pool dictates how many concurrent requests are possible (but if you are using the sync
+# RPC server, it also dictates the number of clients that can be connected at all).
+#
+# The default is unlimited and thus provides no protection against clients overwhelming the server. You are
+# encouraged to set a maximum that makes sense for you in production, but do keep in mind that
+# rpc_max_threads represents the maximum number of client requests this server may execute concurrently.
+#
+# rpc_min_threads: 16
+# rpc_max_threads: 2048
+
+# uncomment to set socket buffer sizes on rpc connections
+# rpc_send_buff_size_in_bytes:
+# rpc_recv_buff_size_in_bytes:
+
+# Uncomment to set socket buffer size for internode communication
+# Note that when setting this, the buffer size is limited by net.core.wmem_max
+# and when not setting it, it is defined by net.ipv4.tcp_wmem
+# See:
+# /proc/sys/net/core/wmem_max
+# /proc/sys/net/core/rmem_max
+# /proc/sys/net/ipv4/tcp_wmem
+# /proc/sys/net/ipv4/tcp_rmem
+# and: man tcp
+# internode_send_buff_size_in_bytes:
+# internode_recv_buff_size_in_bytes:
+
+# Frame size for thrift (maximum message length).
+thrift_framed_transport_size_in_mb: 15
+
+# Set to true to have Cassandra create a hard link to each sstable
+# flushed or streamed locally in a backups/ subdirectory of the
+# keyspace data.  Removing these links is the operator's
+# responsibility.
+incremental_backups: false
+
+# Whether or not to take a snapshot before each compaction.  Be
+# careful using this option, since Cassandra won't clean up the
+# snapshots for you.  Mostly useful if you're paranoid when there
+# is a data format change.
+snapshot_before_compaction: false
+
+# Whether or not a snapshot is taken of the data before keyspace truncation
+# or dropping of column families. The STRONGLY advised default of true 
+# should be used to provide data safety. If you set this flag to false, you will
+# lose data on truncation or drop.
+auto_snapshot: true
+
+# Granularity of the collation index of rows within a partition.
+# Increase if your rows are large, or if you have a very large
+# number of rows per partition.  The competing goals are these:
+#   1) a smaller granularity means more index entries are generated
+#      and looking up rows within the partition by collation column
+#      is faster
+#   2) but, Cassandra will keep the collation index in memory for hot
+#      rows (as part of the key cache), so a larger granularity means
+#      you can cache more hot rows
+column_index_size_in_kb: 64
+# Per sstable indexed key cache entries (the collation index in memory
+# mentioned above) exceeding this size will not be held on heap.
+# This means that only partition information is held on heap and the
+# index entries are read from disk.
+#
+# Note that this size refers to the size of the
+# serialized index information and not the size of the partition.
+column_index_cache_size_in_kb: 2
+
+# Number of simultaneous compactions to allow, NOT including
+# validation "compactions" for anti-entropy repair.  Simultaneous
+# compactions can help preserve read performance in a mixed read/write
+# workload, by mitigating the tendency of small sstables to accumulate
+# during a single long-running compaction. The default is usually
+# fine and if you experience problems with compaction running too
+# slowly or too fast, you should look at
+# compaction_throughput_mb_per_sec first.
+#
+# concurrent_compactors defaults to the smaller of (number of disks,
+# number of cores), with a minimum of 2 and a maximum of 8.
+# 
+# If your data directories are backed by SSD, you should increase this
+# to the number of cores.
+#concurrent_compactors: 1
+
+# Throttles compaction to the given total throughput across the entire
+# system. The faster you insert data, the faster you need to compact in
+# order to keep the sstable count down, but in general, setting this to
+# 16 to 32 times the rate you are inserting data is more than sufficient.
+# Setting this to 0 disables throttling. Note that this accounts for all types
+# of compaction, including validation compaction.
+compaction_throughput_mb_per_sec: 16
+
+# When compacting, the replacement sstable(s) can be opened before they
+# are completely written, and used in place of the prior sstables for
+# any range that has been written. This helps to smoothly transfer reads 
+# between the sstables, reducing page cache churn and keeping hot rows hot
+sstable_preemptive_open_interval_in_mb: 50
+
+# Throttles all outbound streaming file transfers on this node to the
+# given total throughput in Mbps. This is necessary because Cassandra does
+# mostly sequential IO when streaming data during bootstrap or repair, which
+# can lead to saturating the network connection and degrading rpc performance.
+# When unset, the default is 200 Mbps or 25 MB/s.
+# stream_throughput_outbound_megabits_per_sec: 200
+
+# Throttles all streaming file transfer between the datacenters,
+# this setting allows users to throttle inter dc stream throughput in addition
+# to throttling all network stream traffic as configured with
+# stream_throughput_outbound_megabits_per_sec
+# When unset, the default is 200 Mbps or 25 MB/s
+# inter_dc_stream_throughput_outbound_megabits_per_sec: 200
+
+# How long the coordinator should wait for read operations to complete
+read_request_timeout_in_ms: 5000
+# How long the coordinator should wait for seq or index scans to complete
+range_request_timeout_in_ms: 10000
+# How long the coordinator should wait for writes to complete
+write_request_timeout_in_ms: 2000
+# How long the coordinator should wait for counter writes to complete
+counter_write_request_timeout_in_ms: 5000
+# How long a coordinator should continue to retry a CAS operation
+# that contends with other proposals for the same row
+cas_contention_timeout_in_ms: 1000
+# How long the coordinator should wait for truncates to complete
+# (This can be much longer, because unless auto_snapshot is disabled
+# we need to flush first so we can snapshot before removing the data.)
+truncate_request_timeout_in_ms: 60000
+# The default timeout for other, miscellaneous operations
+request_timeout_in_ms: 10000
+
+# Enable operation timeout information exchange between nodes to accurately
+# measure request timeouts.  If disabled, replicas will assume that requests
+# were forwarded to them instantly by the coordinator, which means that
+# under overload conditions we will waste that much extra time processing 
+# already-timed-out requests.
+#
+# Warning: before enabling this property make sure that ntp is installed
+# and the times are synchronized between the nodes.
+cross_node_timeout: false
+
+# Set socket timeout for streaming operation.
+# The stream session is failed if no data/ack is received by any of the participants
+# within that period, which means this should also be sufficient to stream a large
+# sstable or rebuild table indexes.
+# Default value is 86400000ms, which means stale streams timeout after 24 hours.
+# A value of zero means stream sockets should never time out.
+# streaming_socket_timeout_in_ms: 86400000
+
+# phi value that must be reached for a host to be marked down.
+# most users should never need to adjust this.
+# phi_convict_threshold: 8
+
+# endpoint_snitch -- Set this to a class that implements
+# IEndpointSnitch.  The snitch has two functions:
+# - it teaches Cassandra enough about your network topology to route
+#   requests efficiently
+# - it allows Cassandra to spread replicas around your cluster to avoid
+#   correlated failures. It does this by grouping machines into
+#   "datacenters" and "racks."  Cassandra will do its best not to have
+#   more than one replica on the same "rack" (which may not actually
+#   be a physical location)
+#
+# IF YOU CHANGE THE SNITCH AFTER DATA IS INSERTED INTO THE CLUSTER,
+# YOU MUST RUN A FULL REPAIR, SINCE THE SNITCH AFFECTS WHERE REPLICAS
+# ARE PLACED.
+#
+# IF THE RACK A REPLICA IS PLACED IN CHANGES AFTER THE REPLICA HAS BEEN
+# ADDED TO A RING, THE NODE MUST BE DECOMMISSIONED AND REBOOTSTRAPPED.
+#
+# Out of the box, Cassandra provides
+#  - SimpleSnitch:
+#    Treats Strategy order as proximity. This can improve cache
+#    locality when disabling read repair.  Only appropriate for
+#    single-datacenter deployments.
+#  - GossipingPropertyFileSnitch
+#    This should be your go-to snitch for production use.  The rack
+#    and datacenter for the local node are defined in
+#    cassandra-rackdc.properties and propagated to other nodes via
+#    gossip.  If cassandra-topology.properties exists, it is used as a
+#    fallback, allowing migration from the PropertyFileSnitch.
+#  - PropertyFileSnitch:
+#    Proximity is determined by rack and data center, which are
+#    explicitly configured in cassandra-topology.properties.
+#  - Ec2Snitch:
+#    Appropriate for EC2 deployments in a single Region. Loads Region
+#    and Availability Zone information from the EC2 API. The Region is
+#    treated as the datacenter, and the Availability Zone as the rack.
+#    Only private IPs are used, so this will not work across multiple
+#    Regions.
+#  - Ec2MultiRegionSnitch:
+#    Uses public IPs as broadcast_address to allow cross-region
+#    connectivity.  (Thus, you should set seed addresses to the public
+#    IP as well.) You will need to open the storage_port or
+#    ssl_storage_port on the public IP firewall.  (For intra-Region
+#    traffic, Cassandra will switch to the private IP after
+#    establishing a connection.)
+#  - RackInferringSnitch:
+#    Proximity is determined by rack and data center, which are
+#    assumed to correspond to the 3rd and 2nd octet of each node's IP
+#    address, respectively.  Unless this happens to match your
+#    deployment conventions, this is best used as an example of
+#    writing a custom Snitch class and is provided in that spirit.
+#
+# You can use a custom Snitch by setting this to the full class name
+# of the snitch, which will be assumed to be on your classpath.
+endpoint_snitch: SimpleSnitch
+
+# controls how often to perform the more expensive part of host score
+# calculation
+dynamic_snitch_update_interval_in_ms: 100 
+# controls how often to reset all host scores, allowing a bad host to
+# possibly recover
+dynamic_snitch_reset_interval_in_ms: 600000
+# if set greater than zero and read_repair_chance is < 1.0, this will allow
+# 'pinning' of replicas to hosts in order to increase cache capacity.
+# The badness threshold will control how much worse the pinned host has to be
+# before the dynamic snitch will prefer other replicas over it.  This is
+# expressed as a double which represents a percentage.  Thus, a value of
+# 0.2 means Cassandra would continue to prefer the static snitch values
+# until the pinned host was 20% worse than the fastest.
+dynamic_snitch_badness_threshold: 0.1
+
+# request_scheduler -- Set this to a class that implements
+# RequestScheduler, which will schedule incoming client requests
+# according to the specific policy. This is useful for multi-tenancy
+# with a single Cassandra cluster.
+# NOTE: This is specifically for requests from the client and does
+# not affect inter node communication.
+# org.apache.cassandra.scheduler.NoScheduler - No scheduling takes place
+# org.apache.cassandra.scheduler.RoundRobinScheduler - Round robin of
+# client requests to a node with a separate queue for each
+# request_scheduler_id. The scheduler is further customized by
+# request_scheduler_options as described below.
+request_scheduler: org.apache.cassandra.scheduler.NoScheduler
+
+# Scheduler Options vary based on the type of scheduler
+# NoScheduler - Has no options
+# RoundRobin
+#  - throttle_limit -- The throttle_limit is the number of in-flight
+#                      requests per client.  Requests beyond 
+#                      that limit are queued up until
+#                      running requests can complete.
+#                      The value of 80 here is twice the number of
+#                      concurrent_reads + concurrent_writes.
+#  - default_weight -- default_weight is optional and allows for
+#                      overriding the default which is 1.
+#  - weights -- Weights are optional and will default to 1 or the
+#               overridden default_weight. The weight translates into how
+#               many requests are handled during each turn of the
+#               RoundRobin, based on the scheduler id.
+#
+# request_scheduler_options:
+#    throttle_limit: 80
+#    default_weight: 5
+#    weights:
+#      Keyspace1: 1
+#      Keyspace2: 5
+
+# request_scheduler_id -- An identifier based on which to perform
+# the request scheduling. Currently the only valid option is keyspace.
+# request_scheduler_id: keyspace
+
+# Enable or disable inter-node encryption
+# JVM defaults for supported SSL socket protocols and cipher suites can
+# be replaced using custom encryption options. This is not recommended
+# unless you have policies in place that dictate certain settings, or
+# need to disable vulnerable ciphers or protocols in case the JVM cannot
+# be updated.
+# FIPS compliant settings can be configured at JVM level and should not
+# involve changing encryption settings here:
+# https://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/FIPS.html
+# NOTE: No custom encryption options are enabled at the moment
+# The available internode options are : all, none, dc, rack
+#
+# If set to dc cassandra will encrypt the traffic between the DCs
+# If set to rack cassandra will encrypt the traffic between the racks
+#
+# The passwords used in these options must match the passwords used when generating
+# the keystore and truststore.  For instructions on generating these files, see:
+# http://download.oracle.com/javase/6/docs/technotes/guides/security/jsse/JSSERefGuide.html#CreateKeystore
+#
+server_encryption_options:
+    internode_encryption: none
+    keystore: conf/.keystore
+    keystore_password: cassandra
+    truststore: conf/.truststore
+    truststore_password: cassandra
+    # More advanced defaults below:
+    # protocol: TLS
+    # algorithm: SunX509
+    # store_type: JKS
+    # cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA]
+    # require_client_auth: false
+    # require_endpoint_verification: false
+
+# enable or disable client/server encryption.
+client_encryption_options:
+    enabled: false
+    # If enabled and optional is set to true encrypted and unencrypted connections are handled.
+    optional: false
+    keystore: conf/.keystore
+    keystore_password: cassandra
+    # require_client_auth: false
+    # Set trustore and truststore_password if require_client_auth is true
+    # truststore: conf/.truststore
+    # truststore_password: cassandra
+    # More advanced defaults below:
+    # protocol: TLS
+    # algorithm: SunX509
+    # store_type: JKS
+    # cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA]
+
+# internode_compression controls whether traffic between nodes is
+# compressed.
+# can be:  all  - all traffic is compressed
+#          dc   - traffic between different datacenters is compressed
+#          none - nothing is compressed.
+internode_compression: dc
+
+# Enable or disable tcp_nodelay for inter-dc communication.
+# Disabling it will result in larger (but fewer) network packets being sent,
+# reducing overhead from the TCP protocol itself, at the cost of increasing
+# latency if you block for cross-datacenter responses.
+inter_dc_tcp_nodelay: false
+
+# TTL for different trace types used during logging of the repair process.
+tracetype_query_ttl: 86400
+tracetype_repair_ttl: 604800
+
+# UDFs (user defined functions) are disabled by default.
+# As of Cassandra 3.0 there is a sandbox in place that should prevent execution of evil code.
+enable_user_defined_functions: false
+
+# Enables scripted UDFs (JavaScript UDFs).
+# Java UDFs are always enabled, if enable_user_defined_functions is true.
+# Enable this option to be able to use UDFs with "language javascript" or any custom JSR-223 provider.
+# This option has no effect, if enable_user_defined_functions is false.
+enable_scripted_user_defined_functions: false
+
+# The default Windows kernel timer and scheduling resolution is 15.6ms for power conservation.
+# Lowering this value on Windows can provide much tighter latency and better throughput, however
+# some virtualized environments may see a negative performance impact from changing this setting
+# below their system default. The sysinternals 'clockres' tool can confirm your system's default
+# setting.
+windows_timer_interval: 1
+
+
+# Enables encrypting data at-rest (on disk). Different key providers can be plugged in, but the default reads from
+# a JCE-style keystore. A single keystore can hold multiple keys, but the one referenced by
+# the "key_alias" is the only key that will be used for encrypt opertaions; previously used keys
+# can still (and should!) be in the keystore and will be used on decrypt operations
+# (to handle the case of key rotation).
+#
+# It is strongly recommended to download and install Java Cryptography Extension (JCE)
+# Unlimited Strength Jurisdiction Policy Files for your version of the JDK.
+# (current link: http://www.oracle.com/technetwork/java/javase/downloads/jce8-download-2133166.html)
+#
+# Currently, only the following file types are supported for transparent data encryption, although
+# more are coming in future cassandra releases: commitlog, hints
+transparent_data_encryption_options:
+    enabled: false
+    chunk_length_kb: 64
+    cipher: AES/CBC/PKCS5Padding
+    key_alias: testing:1
+    # CBC IV length for AES needs to be 16 bytes (which is also the default size)
+    # iv_length: 16
+    key_provider: 
+      - class_name: org.apache.cassandra.security.JKSKeyProvider
+        parameters: 
+          - keystore: conf/.keystore
+            keystore_password: cassandra
+            store_type: JCEKS
+            key_password: cassandra
+
+
+#####################
+# SAFETY THRESHOLDS #
+#####################
+
+# When executing a scan, within or across a partition, we need to keep the
+# tombstones seen in memory so we can return them to the coordinator, which
+# will use them to make sure other replicas also know about the deleted rows.
+# With workloads that generate a lot of tombstones, this can cause performance
+# problems and even exhaust the server heap.
+# (http://www.datastax.com/dev/blog/cassandra-anti-patterns-queues-and-queue-like-datasets)
+# Adjust the thresholds here if you understand the dangers and want to
+# scan more tombstones anyway.  These thresholds may also be adjusted at runtime
+# using the StorageService mbean.
+tombstone_warn_threshold: 1000
+tombstone_failure_threshold: 100000
+
+# Log WARN on any batch size exceeding this value. 5kb per batch by default.
+# Caution should be taken on increasing the size of this threshold as it can lead to node instability.
+batch_size_warn_threshold_in_kb: 5
+
+# Fail any batch exceeding this value. 50kb (10x warn threshold) by default.
+batch_size_fail_threshold_in_kb: 50
+
+# Log WARN on any batches not of type LOGGED that span across more partitions than this limit
+unlogged_batch_across_partitions_warn_threshold: 10
+
+# Log a warning when compacting partitions larger than this value
+compaction_large_partition_warning_threshold_mb: 100
+
+# GC Pauses greater than gc_warn_threshold_in_ms will be logged at WARN level
+# Adjust the threshold based on your application throughput requirement
+# By default, Cassandra logs GC Pauses greater than 200 ms at INFO level
+gc_warn_threshold_in_ms: 1000
+
+# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption
+# early. Any value size larger than this threshold will result in marking an SSTable
+# as corrupted.
+# max_value_size_in_mb: 256
+
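
The yaml above backs the embedded Cassandra server used by the JUnit tests, with the hints, data, commitlog and saved-cache directories redirected under target/ so test runs stay inside the build tree. As a minimal sketch of how such a yaml is typically loaded, assuming the cassandra-unit library is on the test classpath (the exact wiring in this module may differ):

import org.cassandraunit.utils.EmbeddedCassandraServerHelper;
import org.junit.AfterClass;
import org.junit.BeforeClass;

public class EmbeddedCassandraSketch {
  @BeforeClass
  public static void startCassandra() throws Exception {
    // Loads the yaml as a resource from the test classpath; the target/cassandra/*
    // directories configured above are created relative to the working directory.
    EmbeddedCassandraServerHelper.startEmbeddedCassandra("cassandra.yaml");
  }

  @AfterClass
  public static void cleanCassandra() {
    // Truncates data between test classes; the embedded server itself is stopped
    // by cassandra-unit's JVM shutdown hook.
    EmbeddedCassandraServerHelper.cleanEmbeddedCassandra();
  }
}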

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/SmallITCluster/cassandra-svc-rc.yaml
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/SmallITCluster/cassandra-svc-rc.yaml b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/SmallITCluster/cassandra-svc-rc.yaml
new file mode 100644
index 0000000..7c36e34
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/SmallITCluster/cassandra-svc-rc.yaml
@@ -0,0 +1,88 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Headless service that allows us to get the IP addresses of our Cassandra nodes
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    name: cassandra-peers
+  name: cassandra-peers
+spec:
+  clusterIP: None
+  ports:
+    - port: 7000
+      name: intra-node-communication
+    - port: 7001
+      name: tls-intra-node-communication
+  selector:
+    name: cassandra
+---
+# Kubernetes service file exposing the Cassandra endpoint used by clients.
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    name: cassandra
+  name: cassandra
+spec:
+  ports:
+    - port: 9042
+      name: cql
+  selector:
+    name: cassandra
+  type: LoadBalancer
+---
+# Replication Controller for Cassandra which tracks the Cassandra pods.
+apiVersion: v1
+kind: ReplicationController
+metadata:
+  labels:
+    name: cassandra
+  name: cassandra
+spec:
+  replicas: 1
+  selector:
+    name: cassandra
+  template:
+    metadata:
+      labels:
+        name: cassandra
+    spec:
+      containers:
+        - image: cassandra
+          name: cassandra
+          env:
+            - name: PEER_DISCOVERY_SERVICE
+              value: cassandra-peers
+            - name: CASSANDRA_CLUSTER_NAME
+              value: Cassandra
+            - name: CASSANDRA_DC
+              value: DC1
+            - name: CASSANDRA_RACK
+              value: Kubernetes Cluster
+# Number of tokens is currently set to 1. If not configured, the default value is 256. Change it as required.
+            - name: CASSANDRA_NUM_TOKENS
+              value: '1'
+          ports:
+            - containerPort: 9042
+              name: cql
+          volumeMounts:
+            - mountPath: /var/lib/cassandra/data
+              name: data
+      volumes:
+        - name: data
+          emptyDir: {}
\ No newline at end of file
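
The cassandra service above exposes CQL on port 9042 through a LoadBalancer. A rough
sketch of how an external client could locate the service and connect once the external
IP has been assigned (assumes kubectl access and a local cqlsh install; the jsonpath
expression is the same one used by data-load.sh further below, and CASSANDRA_IP is just
an illustrative variable name):

    # Look up the external IP assigned to the LoadBalancer service.
    CASSANDRA_IP="$(kubectl get svc cassandra \
      -o jsonpath='{.status.loadBalancer.ingress[0].ip}')"

    # Connect to the CQL port exposed by the service.
    cqlsh "$CASSANDRA_IP" 9042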

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/SmallITCluster/start-up.sh
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/SmallITCluster/start-up.sh b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/SmallITCluster/start-up.sh
new file mode 100644
index 0000000..c05b771
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/SmallITCluster/start-up.sh
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/bin/bash
+set -e
+
+# Create Cassandra services and Replication controller.
+kubectl create -f cassandra-svc-rc.yaml
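
After running start-up.sh, a quick way to confirm that the Replication Controller has
brought the pod up before loading any data (a sketch, relying on the name=cassandra
label defined in cassandra-svc-rc.yaml above):

    # Watch the Cassandra pod until it reaches the Running state.
    kubectl get pods -l name=cassandra -w

    # Optionally check the container log of the first pod.
    kubectl logs "$(kubectl get pods -l name=cassandra \
      -o jsonpath='{.items[0].metadata.name}')"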

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/SmallITCluster/teardown.sh
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/SmallITCluster/teardown.sh b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/SmallITCluster/teardown.sh
new file mode 100644
index 0000000..f538a75
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/SmallITCluster/teardown.sh
@@ -0,0 +1,21 @@
+#
+#    Licensed to the Apache Software Foundation (ASF) under one or more
+#    contributor license agreements.  See the NOTICE file distributed with
+#    this work for additional information regarding copyright ownership.
+#    The ASF licenses this file to You under the Apache License, Version 2.0
+#    (the "License"); you may not use this file except in compliance with
+#    the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+#
+#!/bin/bash
+set -e
+
+# Delete Cassandra services and Replication controller.
+kubectl delete -f cassandra-svc-rc.yaml

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/data-load-setup.sh
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/data-load-setup.sh b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/data-load-setup.sh
new file mode 100644
index 0000000..4e12f89
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/data-load-setup.sh
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/bin/bash
+set -e
+
+# Load YCSB tool
+echo "Downloading YCSB tool"
+echo "------------------------------"
+curl -O --location https://github.com/brianfrankcooper/YCSB/releases/download/0.12.0/ycsb-0.12.0.tar.gz
+tar xfz ycsb-0.12.0.tar.gz
+wget https://www.slf4j.org/dist/slf4j-1.7.22.tar.gz
+tar xfz slf4j-1.7.22.tar.gz
+cp slf4j-1.7.22/slf4j-simple-*.jar ycsb-0.12.0/lib/
+cp slf4j-1.7.22/slf4j-api-*.jar ycsb-0.12.0/lib/
+echo "YCSB tool loaded"

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/data-load.sh
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/data-load.sh b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/data-load.sh
new file mode 100644
index 0000000..59d0e22
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/resources/kubernetes/cassandra/data-load.sh
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/bin/bash
+set -e
+
+recordcount=1000
+# Identify the pod
+cassandra_pods="kubectl get pods -l name=cassandra"
+running_seed="$(kubectl get pods -o json -l name=cassandra -o jsonpath=\
+'{.items[0].metadata.name}')"
+echo "Detected Running Pod $running_seed"
+
+# After the service starts, it can take a couple of minutes for the external IP to be
+# assigned, so wait until it is available.
+
+# Identify the external IP of the cassandra service.
+external_ip="$(kubectl get svc cassandra -o jsonpath='{.status.loadBalancer.ingress[0].ip}')"
+echo "Waiting for the Cassandra service to come up ........"
+while [ -z "$external_ip" ]
+do
+   sleep 10s
+   external_ip="$(kubectl get svc cassandra -o jsonpath='{.status.loadBalancer.ingress[0].ip}')"
+   echo "."
+done
+echo "External IP - $external_ip"
+
+# Create keyspace
+keyspace_creation_command="drop keyspace if exists ycsb;create keyspace ycsb WITH REPLICATION = {\
+'class' : 'SimpleStrategy', 'replication_factor': 3 };"
+kubectl exec -ti $running_seed -- cqlsh -e "$keyspace_creation_command"
+echo "Keyspace creation............"
+echo "-----------------------------"
+echo "$keyspace_creation_command"
+echo
+
+# Create table
+table_creation_command="use ycsb;drop table if exists usertable;create table usertable (\
+y_id varchar primary key,field0 varchar,field1 varchar,field2 varchar,field3 varchar,\
+field4 varchar,field5 varchar,field6 varchar,field7 varchar,field8 varchar,field9 varchar);"
+kubectl exec -ti $running_seed -- cqlsh -e "$table_creation_command"
+echo "Table creation .............."
+echo "-----------------------------"
+echo "$table_creation_command"
+
+cd ycsb-0.12.0
+
+echo "Starting to load data on ${external_ip}"
+echo "-----------------------------"
+# Record count is set to 1000; change the recordcount variable above to load a different amount of data.
+# The dataintegrity flag is set to true to load deterministic data.
+./bin/ycsb load cassandra-cql -p hosts=${external_ip} -p dataintegrity=true -p recordcount=\
+${recordcount} -p insertorder=ordered -p fieldlength=20 -P workloads/workloadd \
+-s > workloada_load_res.txt
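
Once the load completes, one way to verify that the expected rows landed in the
usertable created above; this reuses the pod detected earlier in the script, and a
COUNT over 1000 rows is cheap, though it should be avoided for much larger record
counts:

    # Count the rows YCSB inserted into ycsb.usertable.
    kubectl exec -ti "$running_seed" -- cqlsh \
      -e "SELECT count(*) FROM ycsb.usertable;"

The YCSB summary redirected to workloada_load_res.txt should also report the same
number of insert operations.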


[5/7] beam git commit: HadoopInputFormatIO with junits

Posted by da...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIOTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIOTest.java b/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIOTest.java
deleted file mode 100644
index c25cf51..0000000
--- a/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIOTest.java
+++ /dev/null
@@ -1,844 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
- * agreements. See the NOTICE file distributed with this work for additional information regarding
- * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance with the License. You may obtain a
- * copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-package org.apache.beam.sdk.io.hadoop.inputformat;
-
-import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
-import static org.hamcrest.Matchers.containsInAnyOrder;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertThat;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map.Entry;
-
-import org.apache.beam.sdk.Pipeline.PipelineExecutionException;
-import org.apache.beam.sdk.coders.AvroCoder;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.io.BoundedSource;
-import org.apache.beam.sdk.io.BoundedSource.BoundedReader;
-import org.apache.beam.sdk.io.hadoop.WritableCoder;
-import org.apache.beam.sdk.io.hadoop.inputformat.EmployeeInputFormat.EmployeeRecordReader;
-import org.apache.beam.sdk.io.hadoop.inputformat.EmployeeInputFormat.NewObjectsEmployeeInputSplit;
-import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.HadoopInputFormatBoundedSource;
-import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.SerializableConfiguration;
-import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.SerializableSplit;
-import org.apache.beam.sdk.testing.PAssert;
-import org.apache.beam.sdk.testing.SourceTestUtils;
-import org.apache.beam.sdk.testing.TestPipeline;
-import org.apache.beam.sdk.transforms.SimpleFunction;
-import org.apache.beam.sdk.transforms.display.DisplayData;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PBegin;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.MapWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.junit.BeforeClass;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.ExpectedException;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-import org.mockito.Mockito;
-
-/**
- * Unit tests for {@link HadoopInputFormatIO}.
- */
-@RunWith(JUnit4.class)
-public class HadoopInputFormatIOTest {
-  static SerializableConfiguration serConf;
-  static SimpleFunction<Text, String> myKeyTranslate;
-  static SimpleFunction<Employee, String> myValueTranslate;
-
-  @Rule public final transient TestPipeline p = TestPipeline.create();
-  @Rule public ExpectedException thrown = ExpectedException.none();
-
-  private PBegin input = PBegin.in(p);
-
-  @BeforeClass
-  public static void setUp() throws IOException, InterruptedException {
-    serConf = loadTestConfiguration(
-                  EmployeeInputFormat.class,
-                  Text.class,
-                  Employee.class);
-    myKeyTranslate = new SimpleFunction<Text, String>() {
-      @Override
-      public String apply(Text input) {
-        return input.toString();
-      }
-    };
-    myValueTranslate = new SimpleFunction<Employee, String>() {
-      @Override
-      public String apply(Employee input) {
-        return input.getEmpName() + "_" + input.getEmpAddress();
-      }
-    };
-  }
-
-  @Test
-  public void testReadBuildsCorrectly() {
-    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
-        .withConfiguration(serConf.getHadoopConfiguration())
-        .withKeyTranslation(myKeyTranslate)
-        .withValueTranslation(myValueTranslate);
-    assertEquals(serConf.getHadoopConfiguration(),
-        read.getConfiguration().getHadoopConfiguration());
-    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
-    assertEquals(myValueTranslate, read.getValueTranslationFunction());
-    assertEquals(myValueTranslate.getOutputTypeDescriptor(), read.getValueTypeDescriptor());
-    assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
-  }
-
-  /**
-   * This test validates {@link HadoopInputFormatIO.Read Read} builds correctly in different order
-   * of with configuration/key translation/value translation. This test also validates output
-   * PCollection key/value classes are set correctly even if Hadoop configuration is set after
-   * setting key/value translation.
-   */
-  @Test
-  public void testReadBuildsCorrectlyInDifferentOrder() {
-    HadoopInputFormatIO.Read<String, String> read =
-        HadoopInputFormatIO.<String, String>read()
-            .withValueTranslation(myValueTranslate)
-            .withConfiguration(serConf.getHadoopConfiguration())
-            .withKeyTranslation(myKeyTranslate);
-    assertEquals(serConf.getHadoopConfiguration(),
-        read.getConfiguration().getHadoopConfiguration());
-    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
-    assertEquals(myValueTranslate, read.getValueTranslationFunction());
-    assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
-    assertEquals(myValueTranslate.getOutputTypeDescriptor(), read.getValueTypeDescriptor());
-  }
-
-  /**
-   * This test validates {@link HadoopInputFormatIO.Read Read} object creation if
-   * {@link HadoopInputFormatIO.Read#withConfiguration() withConfiguration()} is called more than
-   * once.
-   * @throws InterruptedException
-   * @throws IOException
-   */
-  @Test
-  public void testReadBuildsCorrectlyIfWithConfigurationIsCalledMoreThanOneTime()
-      throws IOException, InterruptedException {
-    SerializableConfiguration diffConf =
-        loadTestConfiguration(
-            EmployeeInputFormat.class,
-            Employee.class,
-            Text.class);
-    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
-        .withConfiguration(serConf.getHadoopConfiguration())
-        .withKeyTranslation(myKeyTranslate)
-        .withConfiguration(diffConf.getHadoopConfiguration());
-    assertEquals(diffConf.getHadoopConfiguration(),
-        read.getConfiguration().getHadoopConfiguration());
-    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
-    assertEquals(null, read.getValueTranslationFunction());
-    assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
-    assertEquals(diffConf.getHadoopConfiguration().getClass("value.class", Object.class), read
-        .getValueTypeDescriptor().getRawType());
-  }
-
-  /**
-   * This test validates functionality of {@link HadoopInputFormatIO.Read#populateDisplayData()
-   * populateDisplayData()}.
-   */
-  @Test
-  public void testReadDisplayData() {
-    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
-        .withConfiguration(serConf.getHadoopConfiguration())
-        .withKeyTranslation(myKeyTranslate)
-        .withValueTranslation(myValueTranslate);
-    DisplayData displayData = DisplayData.from(read);
-    Iterator<Entry<String, String>> propertyElement = serConf.getHadoopConfiguration().iterator();
-    while (propertyElement.hasNext()) {
-      Entry<String, String> element = propertyElement.next();
-      assertThat(displayData, hasDisplayItem(element.getKey(), element.getValue()));
-    }
-  }
-
-  /**
-   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation fails with
-   * null configuration. {@link HadoopInputFormatIO.Read#withConfiguration() withConfiguration()}
-   * method checks configuration is null and throws exception if it is null.
-   */
-  @Test
-  public void testReadObjectCreationFailsIfConfigurationIsNull() {
-    thrown.expect(NullPointerException.class);
-    HadoopInputFormatIO.<Text, Employee>read()
-          .withConfiguration(null);
-  }
-
-  /**
-   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with only
-   * configuration.
-   */
-  @Test
-  public void testReadObjectCreationWithConfiguration() {
-    HadoopInputFormatIO.Read<Text, Employee> read = HadoopInputFormatIO.<Text, Employee>read()
-        .withConfiguration(serConf.getHadoopConfiguration());
-    assertEquals(serConf.getHadoopConfiguration(),
-        read.getConfiguration().getHadoopConfiguration());
-    assertEquals(null, read.getKeyTranslationFunction());
-    assertEquals(null, read.getValueTranslationFunction());
-    assertEquals(serConf.getHadoopConfiguration().getClass("key.class", Object.class), read
-        .getKeyTypeDescriptor().getRawType());
-    assertEquals(serConf.getHadoopConfiguration().getClass("value.class", Object.class), read
-        .getValueTypeDescriptor().getRawType());
-  }
-
-  /**
-   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation fails with
-   * configuration and null key translation. {@link HadoopInputFormatIO.Read#withKeyTranslation()
-   * withKeyTranslation()} checks keyTranslation is null and throws exception if it null value is
-   * passed.
-   */
-  @Test
-  public void testReadObjectCreationFailsIfKeyTranslationFunctionIsNull() {
-    thrown.expect(NullPointerException.class);
-    HadoopInputFormatIO.<String, Employee>read()
-        .withConfiguration(serConf.getHadoopConfiguration())
-        .withKeyTranslation(null);
-  }
-
-  /**
-   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with
-   * configuration and key translation.
-   */
-  @Test
-  public void testReadObjectCreationWithConfigurationKeyTranslation() {
-    HadoopInputFormatIO.Read<String, Employee> read = HadoopInputFormatIO.<String, Employee>read()
-        .withConfiguration(serConf.getHadoopConfiguration())
-        .withKeyTranslation(myKeyTranslate);
-    assertEquals(serConf.getHadoopConfiguration(),
-        read.getConfiguration().getHadoopConfiguration());
-    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
-    assertEquals(null, read.getValueTranslationFunction());
-    assertEquals(myKeyTranslate.getOutputTypeDescriptor().getRawType(),
-        read.getKeyTypeDescriptor().getRawType());
-    assertEquals(serConf.getHadoopConfiguration().getClass("value.class", Object.class),
-        read.getValueTypeDescriptor().getRawType());
-  }
-
-  /**
-   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation fails with
-   * configuration and null value translation.
-   * {@link HadoopInputFormatIO.Read#withValueTranslation() withValueTranslation()} checks
-   * valueTranslation is null and throws exception if null value is passed.
-   */
-  @Test
-  public void testReadObjectCreationFailsIfValueTranslationFunctionIsNull() {
-    thrown.expect(NullPointerException.class);
-    HadoopInputFormatIO.<Text, String>read()
-        .withConfiguration(serConf.getHadoopConfiguration())
-        .withValueTranslation(null);
-  }
-
-  /**
-   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with
-   * configuration and value translation.
-   */
-  @Test
-  public void testReadObjectCreationWithConfigurationValueTranslation() {
-    HadoopInputFormatIO.Read<Text, String> read = HadoopInputFormatIO.<Text, String>read()
-        .withConfiguration(serConf.getHadoopConfiguration())
-        .withValueTranslation(myValueTranslate);
-    assertEquals(serConf.getHadoopConfiguration(),
-        read.getConfiguration().getHadoopConfiguration());
-    assertEquals(null, read.getKeyTranslationFunction());
-    assertEquals(myValueTranslate, read.getValueTranslationFunction());
-    assertEquals(serConf.getHadoopConfiguration().getClass("key.class", Object.class),
-        read.getKeyTypeDescriptor().getRawType());
-    assertEquals(myValueTranslate.getOutputTypeDescriptor().getRawType(),
-        read.getValueTypeDescriptor().getRawType());
-  }
-
-  /**
-   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with
-   * configuration, key translation and value translation.
-   */
-  @Test
-  public void testReadObjectCreationWithConfigurationKeyTranslationValueTranslation() {
-    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
-        .withConfiguration(serConf.getHadoopConfiguration())
-        .withKeyTranslation(myKeyTranslate)
-        .withValueTranslation(myValueTranslate);
-    assertEquals(serConf.getHadoopConfiguration(),
-        read.getConfiguration().getHadoopConfiguration());
-    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
-    assertEquals(myValueTranslate, read.getValueTranslationFunction());
-    assertEquals(myKeyTranslate.getOutputTypeDescriptor().getRawType(),
-        read.getKeyTypeDescriptor().getRawType());
-    assertEquals(myValueTranslate.getOutputTypeDescriptor().getRawType(),
-        read.getValueTypeDescriptor().getRawType());
-  }
-
-  /**
-   * This test validates functionality of {@link HadoopInputFormatIO.Read#validate()
-   * Read.validate()} function when Read transform is created without calling
-   * {@link HadoopInputFormatIO.Read#withConfiguration() withConfiguration()}.
-   */
-  @Test
-  public void testReadValidationFailsMissingConfiguration() {
-    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read();
-    thrown.expect(NullPointerException.class);
-    read.validate(input);
-  }
-
-  /**
-   * This test validates functionality of {@link HadoopInputFormatIO.Read#withConfiguration()
-   * withConfiguration()} function when Hadoop InputFormat class is not provided by the user in
-   * configuration.
-   */
-  @Test
-  public void testReadValidationFailsMissingInputFormatInConf() {
-    Configuration configuration = new Configuration();
-    configuration.setClass("key.class", Text.class, Object.class);
-    configuration.setClass("value.class", Employee.class, Object.class);
-    thrown.expect(NullPointerException.class);
-    HadoopInputFormatIO.<Text, Employee>read()
-        .withConfiguration(configuration);
-  }
-
-  /**
-   * This test validates functionality of {@link HadoopInputFormatIO.Read#withConfiguration()
-   * withConfiguration()} function when key class is not provided by the user in configuration.
-   */
-  @Test
-  public void testReadValidationFailsMissingKeyClassInConf() {
-    Configuration configuration = new Configuration();
-    configuration.setClass("mapreduce.job.inputformat.class", EmployeeInputFormat.class,
-        InputFormat.class);
-    configuration.setClass("value.class", Employee.class, Object.class);
-    thrown.expect(NullPointerException.class);
-    HadoopInputFormatIO.<Text, Employee>read()
-        .withConfiguration(configuration);
-  }
-
-  /**
-   * This test validates functionality of {@link HadoopInputFormatIO.Read#withConfiguration()
-   * withConfiguration()} function when value class is not provided by the user in configuration.
-   */
-  @Test
-  public void testReadValidationFailsMissingValueClassInConf() {
-    Configuration configuration = new Configuration();
-    configuration.setClass("mapreduce.job.inputformat.class", EmployeeInputFormat.class,
-        InputFormat.class);
-    configuration.setClass("key.class", Text.class, Object.class);
-    thrown.expect(NullPointerException.class);
-    HadoopInputFormatIO.<Text, Employee>read().withConfiguration(configuration);
-  }
-
-  /**
-   * This test validates functionality of {@link HadoopInputFormatIO.Read#validate()
-   * Read.validate()} function when myKeyTranslate's (simple function provided by user for key
-   * translation) input type is not same as Hadoop InputFormat's keyClass(Which is property set in
-   * configuration as "key.class").
-   */
-  @Test
-  public void testReadValidationFailsWithWrongInputTypeKeyTranslationFunction() {
-    SimpleFunction<LongWritable, String> myKeyTranslateWithWrongInputType =
-        new SimpleFunction<LongWritable, String>() {
-          @Override
-          public String apply(LongWritable input) {
-            return input.toString();
-          }
-        };
-    HadoopInputFormatIO.Read<String, Employee> read = HadoopInputFormatIO.<String, Employee>read()
-        .withConfiguration(serConf.getHadoopConfiguration())
-        .withKeyTranslation(myKeyTranslateWithWrongInputType);
-    thrown.expect(IllegalArgumentException.class);
-    thrown.expectMessage(String.format(
-        "Key translation's input type is not same as hadoop InputFormat : %s key " + "class : %s",
-        serConf.getHadoopConfiguration().getClass("mapreduce.job.inputformat.class",
-            InputFormat.class), serConf.getHadoopConfiguration()
-            .getClass("key.class", Object.class)));
-    read.validate(input);
-  }
-
-  /**
-   * This test validates functionality of {@link HadoopInputFormatIO.Read#validate()
-   * Read.validate()} function when myValueTranslate's (simple function provided by user for value
-   * translation) input type is not same as Hadoop InputFormat's valueClass(Which is property set in
-   * configuration as "value.class").
-   */
-  @Test
-  public void testReadValidationFailsWithWrongInputTypeValueTranslationFunction() {
-    SimpleFunction<LongWritable, String> myValueTranslateWithWrongInputType =
-        new SimpleFunction<LongWritable, String>() {
-          @Override
-          public String apply(LongWritable input) {
-            return input.toString();
-          }
-        };
-    HadoopInputFormatIO.Read<Text, String> read =
-        HadoopInputFormatIO.<Text, String>read()
-            .withConfiguration(serConf.getHadoopConfiguration())
-            .withValueTranslation(myValueTranslateWithWrongInputType);
-    String expectedMessage =
-        String.format(
-            "Value translation's input type is not same as hadoop InputFormat :  "
-                + "%s value class : %s",
-            serConf.getHadoopConfiguration().getClass("mapreduce.job.inputformat.class",
-                InputFormat.class),
-            serConf.getHadoopConfiguration().getClass("value.class", Object.class));
-    thrown.expect(IllegalArgumentException.class);
-    thrown.expectMessage(expectedMessage);
-    read.validate(input);
-  }
-
-  /**
-   * This test validates reading from Hadoop InputFormat if wrong key class is set in
-   * configuration.
-   */
-  @Test
-  public void testReadFailsWithWrongKeyClass() {
-    SerializableConfiguration wrongConf = loadTestConfiguration(
-       EmployeeInputFormat.class,
-       MapWritable.class, // Actual key class is Text.class.
-       Employee.class);
-    HadoopInputFormatIO.Read<Text, String> read = HadoopInputFormatIO.<Text, String>read()
-        .withConfiguration(wrongConf.getHadoopConfiguration());
-    String expectedMessage =
-        String.format("java.lang.IllegalArgumentException: " + "Wrong InputFormat key class in "
-            + "configuration : Expected key.class is %s but was %s.", Text.class.getName(),
-            MapWritable.class.getName());
-    thrown.expect(PipelineExecutionException.class);
-    thrown.expectMessage(expectedMessage);
-    p.apply("ReadTest", read);
-    p.run();
-  }
-
-  /**
-   * This test validates reading from Hadoop InputFormat if wrong value class is set in
-   * configuration.
-   */
-  @Test
-  public void testReadFailsWithWrongValueClass() {
-    SerializableConfiguration wrongConf = loadTestConfiguration(
-       EmployeeInputFormat.class,
-       Text.class,
-       MapWritable.class); // Actual value class is Employee.class.
-    HadoopInputFormatIO.Read<Text, MapWritable> read = HadoopInputFormatIO.<Text, MapWritable>read()
-        .withConfiguration(wrongConf.getHadoopConfiguration());
-    String expectedMessage =
-        String.format("java.lang.IllegalArgumentException: "
-            + "Wrong InputFormat value class in configuration : "
-            + "Expected value.class is %s but was %s.", Employee.class.getName(),
-            MapWritable.class.getName());
-    thrown.expect(PipelineExecutionException.class);
-    thrown.expectMessage(expectedMessage);
-    p.apply("ReadTest", read);
-    p.run();
-  }
-
-  @Test
-  public void testReadingData() throws Exception {
-    HadoopInputFormatIO.Read<Text, Employee> read = HadoopInputFormatIO.<Text, Employee>read()
-        .withConfiguration(serConf.getHadoopConfiguration());
-    List<KV<Text, Employee>> expected = TestEmployeeDataSet.getEmployeeData();
-    PCollection<KV<Text, Employee>> actual = p.apply("ReadTest", read);
-    PAssert.that(actual).containsInAnyOrder(expected);
-    p.run();
-  }
-
-  /**
-   * This test validates behavior of {@link HadoopInputFormatBoundedSource} if RecordReader object
-   * creation fails.
-   */
-  @Test
-  public void testReadIfCreateRecordReaderFails() throws Exception {
-    thrown.expect(Exception.class);
-    thrown.expectMessage("Exception in creating RecordReader");
-    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
-    Mockito.when(
-        mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
-            Mockito.any(TaskAttemptContext.class))).thenThrow(
-        new IOException("Exception in creating RecordReader"));
-    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
-        new HadoopInputFormatBoundedSource<Text, Employee>(
-            serConf,
-            WritableCoder.of(Text.class),
-            AvroCoder.of(Employee.class),
-            null, // No key translation required.
-            null, // No value translation required.
-            new SerializableSplit());
-    boundedSource.setInputFormatObj(mockInputFormat);
-    SourceTestUtils.readFromSource(boundedSource, p.getOptions());
-  }
-
-  /**
-   * This test validates behavior of HadoopInputFormatSource if
-   * {@link InputFormat#createRecordReader() createRecordReader()} of InputFormat returns null.
-   */
-  @Test
-  public void testReadWithNullCreateRecordReader() throws Exception {
-    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
-    thrown.expect(IOException.class);
-    thrown.expectMessage(String.format("Null RecordReader object returned by %s",
-            mockInputFormat.getClass()));
-    Mockito.when(
-        mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
-            Mockito.any(TaskAttemptContext.class))).thenReturn(null);
-    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
-        new HadoopInputFormatBoundedSource<Text, Employee>(
-            serConf,
-            WritableCoder.of(Text.class),
-            AvroCoder.of(Employee.class),
-            null, // No key translation required.
-            null, // No value translation required.
-            new SerializableSplit());
-    boundedSource.setInputFormatObj(mockInputFormat);
-    SourceTestUtils.readFromSource(boundedSource, p.getOptions());
-  }
-
-  /**
-   * This test validates behavior of
-   * {@link HadoopInputFormatBoundedSource.HadoopInputFormatReader#start() start()} method if
-   * InputFormat's {@link InputFormat#getSplits() getSplits()} returns InputSplitList having zero
-   * records.
-   */
-  @Test
-  public void testReadersStartWhenZeroRecords() throws Exception {
-    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
-    EmployeeRecordReader mockReader = Mockito.mock(EmployeeRecordReader.class);
-    Mockito.when(
-        mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
-            Mockito.any(TaskAttemptContext.class))).thenReturn(mockReader);
-    Mockito.when(mockReader.nextKeyValue()).thenReturn(false);
-    InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
-    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
-        new HadoopInputFormatBoundedSource<Text, Employee>(
-            serConf,
-            WritableCoder.of(Text.class),
-            AvroCoder.of(Employee.class),
-            null, // No key translation required.
-            null, // No value translation required.
-            new SerializableSplit(mockInputSplit));
-    BoundedReader<KV<Text, Employee>> boundedReader = boundedSource.createReader(p.getOptions());
-    assertEquals(false, boundedReader.start());
-    assertEquals(Double.valueOf(1), boundedReader.getFractionConsumed());
-  }
-
-  /**
-   * This test validates the method getFractionConsumed()- which indicates the progress of the read
-   * in range of 0 to 1.
-   */
-  @Test
-  public void testReadersGetFractionConsumed() throws Exception {
-    List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
-    HadoopInputFormatBoundedSource<Text, Employee> hifSource = getTestHIFSource(
-        EmployeeInputFormat.class,
-        Text.class,
-        Employee.class,
-        WritableCoder.of(Text.class),
-        AvroCoder.of(Employee.class));
-    long estimatedSize = hifSource.getEstimatedSizeBytes(p.getOptions());
-    // Validate if estimated size is equal to the size of records.
-    assertEquals(referenceRecords.size(), estimatedSize);
-    List<BoundedSource<KV<Text, Employee>>> boundedSourceList =
-        hifSource.splitIntoBundles(0, p.getOptions());
-    // Validate if splitIntoBundles() has split correctly.
-    assertEquals(TestEmployeeDataSet.NUMBER_OF_SPLITS, boundedSourceList.size());
-    List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
-    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
-      List<KV<Text, Employee>> elements = new ArrayList<KV<Text, Employee>>();
-      BoundedReader<KV<Text, Employee>> reader = source.createReader(p.getOptions());
-      float recordsRead = 0;
-      // When start is not called, getFractionConsumed() should return 0.
-      assertEquals(Double.valueOf(0), reader.getFractionConsumed());
-      boolean start = reader.start();
-      assertEquals(true, start);
-      if (start) {
-        elements.add(reader.getCurrent());
-        boolean advance = reader.advance();
-        // Validate if getFractionConsumed() returns the correct fraction based on
-        // the number of records read in the split.
-        assertEquals(
-            Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
-            reader.getFractionConsumed());
-        assertEquals(true, advance);
-        while (advance) {
-          elements.add(reader.getCurrent());
-          advance = reader.advance();
-          assertEquals(
-              Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
-              reader.getFractionConsumed());
-        }
-        bundleRecords.addAll(elements);
-      }
-      // Validate if getFractionConsumed() returns 1 after reading is complete.
-      assertEquals(Double.valueOf(1), reader.getFractionConsumed());
-      reader.close();
-    }
-    assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
-  }
-
-  /**
-   * This test validates that reader and its parent source reads the same records.
-   */
-  @Test
-  public void testReaderAndParentSourceReadsSameData() throws Exception {
-    InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
-    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
-        new HadoopInputFormatBoundedSource<Text, Employee>(
-            serConf,
-            WritableCoder.of(Text.class),
-            AvroCoder.of(Employee.class),
-            null, // No key translation required.
-            null, // No value translation required.
-            new SerializableSplit(mockInputSplit));
-    BoundedReader<KV<Text, Employee>> reader = boundedSource
-        .createReader(p.getOptions());
-    SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource(reader, p.getOptions());
-  }
-
-  /**
-   * This test verifies that the method
-   * {@link HadoopInputFormatBoundedSource.HadoopInputFormatReader#getCurrentSource()
-   * getCurrentSource()} returns correct source object.
-   */
-  @Test
-  public void testGetCurrentSourceFunction() throws Exception {
-    SerializableSplit split = new SerializableSplit();
-    BoundedSource<KV<Text, Employee>> source =
-        new HadoopInputFormatBoundedSource<Text, Employee>(
-            serConf,
-            WritableCoder.of(Text.class),
-            AvroCoder.of(Employee.class),
-            null, // No key translation required.
-            null, // No value translation required.
-            split);
-    BoundedReader<KV<Text, Employee>> hifReader = source.createReader(p.getOptions());
-    BoundedSource<KV<Text, Employee>> hifSource = hifReader.getCurrentSource();
-    assertEquals(hifSource, source);
-  }
-
-  /**
-   * This test validates behavior of {@link HadoopInputFormatBoundedSource#createReader()
-   * createReader()} method when {@link HadoopInputFormatBoundedSource#splitIntoBundles()
-   * splitIntoBundles()} is not called.
-   */
-  @Test
-  public void testCreateReaderIfSplitIntoBundlesNotCalled() throws Exception {
-    HadoopInputFormatBoundedSource<Text, Employee> hifSource = getTestHIFSource(
-        EmployeeInputFormat.class,
-        Text.class,
-        Employee.class,
-        WritableCoder.of(Text.class),
-        AvroCoder.of(Employee.class));
-    thrown.expect(IOException.class);
-    thrown.expectMessage("Cannot create reader as source is not split yet.");
-    hifSource.createReader(p.getOptions());
-  }
-
-  /**
-   * This test validates behavior of
-   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when Hadoop
-   * InputFormat's {@link InputFormat#getSplits() getSplits()} returns empty list.
-   */
-  @Test
-  public void testComputeSplitsIfGetSplitsReturnsEmptyList() throws Exception {
-    InputFormat<?, ?> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
-    SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
-    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(
-        new ArrayList<InputSplit>());
-    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
-        new HadoopInputFormatBoundedSource<Text, Employee>(
-            serConf,
-            WritableCoder.of(Text.class),
-            AvroCoder.of(Employee.class),
-            null, // No key translation required.
-            null, // No value translation required.
-            mockInputSplit);
-    thrown.expect(IOException.class);
-    thrown.expectMessage("Error in computing splits, getSplits() returns a empty list");
-    hifSource.setInputFormatObj(mockInputFormat);
-    hifSource.computeSplitsIfNecessary();
-  }
-
-  /**
-   * This test validates behavior of
-   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when Hadoop
-   * InputFormat's {@link InputFormat#getSplits() getSplits()} returns NULL value.
-   */
-  @Test
-  public void testComputeSplitsIfGetSplitsReturnsNullValue() throws Exception {
-    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
-    SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
-    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(null);
-    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
-        new HadoopInputFormatBoundedSource<Text, Employee>(
-            serConf,
-            WritableCoder.of(Text.class),
-            AvroCoder.of(Employee.class),
-            null, // No key translation required.
-            null, // No value translation required.
-            mockInputSplit);
-    thrown.expect(IOException.class);
-    thrown.expectMessage("Error in computing splits, getSplits() returns null.");
-    hifSource.setInputFormatObj(mockInputFormat);
-    hifSource.computeSplitsIfNecessary();
-  }
-
-  /**
-   * This test validates behavior of
-   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} if Hadoop
-   * InputFormat's {@link InputFormat#getSplits() getSplits()} returns InputSplit list having some
-   * null values.
-   */
-  @Test
-  public void testComputeSplitsIfGetSplitsReturnsListHavingNullValues() throws Exception {
-    // InputSplit list having null value.
-    InputSplit mockInputSplit =
-        Mockito.mock(InputSplit.class, Mockito.withSettings().extraInterfaces(Writable.class));
-    List<InputSplit> inputSplitList = new ArrayList<InputSplit>();
-    inputSplitList.add(mockInputSplit);
-    inputSplitList.add(null);
-    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
-    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(
-        inputSplitList);
-    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
-        new HadoopInputFormatBoundedSource<Text, Employee>(
-            serConf,
-            WritableCoder.of(Text.class),
-            AvroCoder.of(Employee.class),
-            null, // No key translation required.
-            null, // No value translation required.
-            new SerializableSplit());
-    thrown.expect(IOException.class);
-    thrown.expectMessage("Error in computing splits, split is null in InputSplits list populated "
-        + "by getSplits() : ");
-    hifSource.setInputFormatObj(mockInputFormat);
-    hifSource.computeSplitsIfNecessary();
-  }
-
-  /**
-   * This test validates records emitted in PCollection are immutable if InputFormat's recordReader
-   * returns same objects(i.e. same locations in memory) but with updated values for each record.
-   */
-  @Test
-  public void testImmutablityOfOutputOfReadIfRecordReaderObjectsAreMutable() throws Exception {
-    List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
-       ReuseObjectsEmployeeInputFormat.class,
-       Text.class,
-       Employee.class,
-       WritableCoder.of(Text.class),
-       AvroCoder.of(Employee.class));
-    List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
-    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
-      List<KV<Text, Employee>> elems = SourceTestUtils.readFromSource(source, p.getOptions());
-      bundleRecords.addAll(elems);
-    }
-    List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
-    assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
-  }
-
-  /**
-   * Test reading if InputFormat implements {@link org.apache.hadoop.conf.Configurable
-   * Configurable}.
-   */
-  @Test
-  public void testReadingWithConfigurableInputFormat() throws Exception {
-    List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
-        ConfigurableEmployeeInputFormat.class,
-        Text.class,
-        Employee.class,
-        WritableCoder.of(Text.class),
-        AvroCoder.of(Employee.class));
-    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
-      // Cast to HadoopInputFormatBoundedSource to access getInputFormat().
-      @SuppressWarnings("unchecked")
-      HadoopInputFormatBoundedSource<Text, Employee> hifSource =
-          (HadoopInputFormatBoundedSource<Text, Employee>) source;
-      hifSource.createInputFormatInstance();
-      ConfigurableEmployeeInputFormat inputFormatObj =
-          (ConfigurableEmployeeInputFormat) hifSource.getInputFormat();
-      assertEquals(true, inputFormatObj.isConfSet);
-    }
-  }
-
-  /**
-   * This test validates records emitted in PCollection are immutable if InputFormat's
-   * {@link org.apache.hadoop.mapreduce.RecordReader RecordReader} returns different objects (i.e.
-   * different locations in memory).
-   */
-  @Test
-  public void testImmutablityOfOutputOfReadIfRecordReaderObjectsAreImmutable() throws Exception {
-   List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
-       EmployeeInputFormat.class,
-       Text.class,
-       Employee.class,
-       WritableCoder.of(Text.class),
-       AvroCoder.of(Employee.class));
-    List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
-    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
-      List<KV<Text, Employee>> elems = SourceTestUtils.readFromSource(source, p.getOptions());
-      bundleRecords.addAll(elems);
-    }
-    List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
-    assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
-  }
-
-  private static SerializableConfiguration loadTestConfiguration(Class<?> inputFormatClassName,
-      Class<?> keyClass, Class<?> valueClass) {
-    Configuration conf = new Configuration();
-    conf.setClass("mapreduce.job.inputformat.class", inputFormatClassName, InputFormat.class);
-    conf.setClass("key.class", keyClass, Object.class);
-    conf.setClass("value.class", valueClass, Object.class);
-    return new SerializableConfiguration(conf);
-  }
-
-  private <K, V> HadoopInputFormatBoundedSource<K, V> getTestHIFSource(
-      Class<?> inputFormatClass,
-      Class<K> inputFormatKeyClass,
-      Class<V> inputFormatValueClass,
-      Coder<K> keyCoder,
-      Coder<V> valueCoder){
-    SerializableConfiguration serConf =
-        loadTestConfiguration(
-            inputFormatClass,
-            inputFormatKeyClass,
-            inputFormatValueClass);
-    return new HadoopInputFormatBoundedSource<K, V>(
-            serConf,
-            keyCoder,
-            valueCoder,
-            null, // No key translation required.
-            null); // No value translation required.
-  }
-
-  private <K, V> List<BoundedSource<KV<K, V>>> getBoundedSourceList(
-      Class<?> inputFormatClass,
-      Class<K> inputFormatKeyClass,
-      Class<V> inputFormatValueClass,
-      Coder<K> keyCoder,
-      Coder<V> valueCoder) throws Exception{
-    HadoopInputFormatBoundedSource<K, V> boundedSource = getTestHIFSource(
-        inputFormatClass,
-        inputFormatKeyClass,
-        inputFormatValueClass,
-        keyCoder,
-        valueCoder);
-    return boundedSource.splitIntoBundles(0, p.getOptions());
-  }
-}

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ReuseObjectsEmployeeInputFormat.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ReuseObjectsEmployeeInputFormat.java b/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ReuseObjectsEmployeeInputFormat.java
deleted file mode 100644
index fbe74ec..0000000
--- a/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ReuseObjectsEmployeeInputFormat.java
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
- * agreements. See the NOTICE file distributed with this work for additional information regarding
- * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance with the License. You may obtain a
- * copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-package org.apache.beam.sdk.io.hadoop.inputformat;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.beam.sdk.values.KV;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-
-/**
- * This is a valid InputFormat for reading employee data which is available in the form of
- * {@code List<KV>} as {@linkplain ReuseObjectsEmployeeRecordReader#employeeDataList
- * employeeDataList}. {@linkplain ReuseObjectsEmployeeRecordReader#employeeDataList
- * employeeDataList} is populated using {@linkplain TestEmployeeDataSet#populateEmployeeDataNew()}.
- *
- * <p>{@linkplain ReuseObjectsEmployeeInputFormat} splits data into
- * {@value TestEmployeeDataSet#NUMBER_OF_SPLITS} splits, each split having
- * {@value TestEmployeeDataSet#NUMBER_OF_RECORDS_IN_EACH_SPLIT} records each.
- * {@linkplain ReuseObjectsEmployeeInputFormat} reads data from
- * {@linkplain ReuseObjectsEmployeeRecordReader#employeeDataList employeeDataList} and produces a
- * key (employee id) of type Text and value of type {@linkplain Employee Employee}.
- *
- * <p>{@linkplain ReuseObjectsEmployeeInputFormat} is also input to test whether
- * {@linkplain HadoopInputFormatIO } source returns immutable records for a scenario when
- * RecordReader returns the same key and value objects with updating values every time it reads
- * data.
- */
-public class ReuseObjectsEmployeeInputFormat extends InputFormat<Text, Employee> {
-
-  public ReuseObjectsEmployeeInputFormat() {}
-
-  @Override
-  public RecordReader<Text, Employee> createRecordReader(InputSplit split,
-      TaskAttemptContext context) throws IOException, InterruptedException {
-    return new ReuseObjectsEmployeeRecordReader();
-  }
-
-  @Override
-  public List<InputSplit> getSplits(JobContext arg0) throws IOException, InterruptedException {
-    List<InputSplit> inputSplitList = new ArrayList<InputSplit>();
-    for (int i = 1; i <= TestEmployeeDataSet.NUMBER_OF_SPLITS; i++) {
-      InputSplit inputSplitObj = new ReuseEmployeeInputSplit(
-          ((i - 1) * TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
-          (i * TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT - 1));
-      inputSplitList.add(inputSplitObj);
-    }
-    return inputSplitList;
-  }
-
-  /**
-   * InputSplit implementation for ReuseObjectsEmployeeInputFormat.
-   */
-  public class ReuseEmployeeInputSplit extends InputSplit implements Writable {
-    // Start and end map index of each split of employeeData.
-    private long startIndex;
-    private long endIndex;
-
-    public ReuseEmployeeInputSplit() {}
-
-    public ReuseEmployeeInputSplit(long startIndex, long endIndex) {
-      this.startIndex = startIndex;
-      this.endIndex = endIndex;
-    }
-
-    /** Returns number of records in each split. */
-    @Override
-    public long getLength() throws IOException, InterruptedException {
-      return this.endIndex - this.startIndex + 1;
-    }
-
-    @Override
-    public String[] getLocations() throws IOException, InterruptedException {
-      return null;
-    }
-
-
-    public long getStartIndex() {
-      return startIndex;
-    }
-
-    public long getEndIndex() {
-      return endIndex;
-    }
-
-    @Override
-    public void readFields(DataInput dataIn) throws IOException {
-      startIndex = dataIn.readLong();
-      endIndex = dataIn.readLong();
-    }
-
-    @Override
-    public void write(DataOutput dataOut) throws IOException {
-      dataOut.writeLong(startIndex);
-      dataOut.writeLong(endIndex);
-    }
-  }
-
-  /**
-   * RecordReader for ReuseObjectsEmployeeInputFormat.
-   */
-  public class ReuseObjectsEmployeeRecordReader extends RecordReader<Text, Employee> {
-
-    private ReuseEmployeeInputSplit split;
-    private Text currentKey = new Text();
-    private Employee currentValue = new Employee();
-    private long employeeListIndex = 0L;
-    private long recordsRead = 0L;
-    private List<KV<String, String>> employeeDataList;
-
-    public ReuseObjectsEmployeeRecordReader() {}
-
-    @Override
-    public void close() throws IOException {}
-
-    @Override
-    public Text getCurrentKey() throws IOException, InterruptedException {
-      return currentKey;
-    }
-
-    @Override
-    public Employee getCurrentValue() throws IOException, InterruptedException {
-      return currentValue;
-    }
-
-    @Override
-    public float getProgress() throws IOException, InterruptedException {
-      return (float) recordsRead / split.getLength();
-    }
-
-    @Override
-    public void initialize(InputSplit split, TaskAttemptContext arg1)
-        throws IOException, InterruptedException {
-      this.split = (ReuseEmployeeInputSplit) split;
-      employeeListIndex = this.split.getStartIndex() - 1;
-      recordsRead = 0;
-      employeeDataList = TestEmployeeDataSet.populateEmployeeData();
-    }
-
-    @Override
-    public boolean nextKeyValue() throws IOException, InterruptedException {
-      if ((recordsRead++) >= split.getLength()) {
-        return false;
-      }
-      employeeListIndex++;
-      KV<String, String> employeeDetails = employeeDataList.get((int) employeeListIndex);
-      String empData[] = employeeDetails.getValue().split("_");
-      // Updating the same key and value objects with new employee data.
-      currentKey.set(employeeDetails.getKey());
-      currentValue.setEmpName(empData[0]);
-      currentValue.setEmpAddress(empData[1]);
-      return true;
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/TestEmployeeDataSet.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/TestEmployeeDataSet.java b/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/TestEmployeeDataSet.java
deleted file mode 100644
index 4a8fe95..0000000
--- a/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/TestEmployeeDataSet.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
- * agreements. See the NOTICE file distributed with this work for additional information regarding
- * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance with the License. You may obtain a
- * copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-package org.apache.beam.sdk.io.hadoop.inputformat;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Lists;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.beam.sdk.values.KV;
-import org.apache.hadoop.io.Text;
-/**
- * Test Utils used in {@link EmployeeInputFormat} and {@link ReuseObjectsEmployeeInputFormat} for
- * computing splits.
- */
-public class TestEmployeeDataSet {
-  public static final long NUMBER_OF_RECORDS_IN_EACH_SPLIT = 5L;
-  public static final long NUMBER_OF_SPLITS = 3L;
-  private static final List<KV<String, String>> data = new ArrayList<KV<String, String>>();
-
-  /**
-   * Returns List of employee details. Employee details are available in the form of {@link KV} in
-   * which, key indicates employee id and value indicates employee details such as name and address
-   * separated by '_'. This is data input to {@link EmployeeInputFormat} and
-   * {@link ReuseObjectsEmployeeInputFormat}.
-   */
-  public static List<KV<String, String>> populateEmployeeData() {
-    if (!data.isEmpty()) {
-      return data;
-    }
-    data.add(KV.of("0", "Alex_US"));
-    data.add(KV.of("1", "John_UK"));
-    data.add(KV.of("2", "Tom_UK"));
-    data.add(KV.of("3", "Nick_UAE"));
-    data.add(KV.of("4", "Smith_IND"));
-    data.add(KV.of("5", "Taylor_US"));
-    data.add(KV.of("6", "Gray_UK"));
-    data.add(KV.of("7", "James_UAE"));
-    data.add(KV.of("8", "Jordan_IND"));
-    data.add(KV.of("9", "Leena_UK"));
-    data.add(KV.of("10", "Zara_UAE"));
-    data.add(KV.of("11", "Talia_IND"));
-    data.add(KV.of("12", "Rose_UK"));
-    data.add(KV.of("13", "Kelvin_UAE"));
-    data.add(KV.of("14", "Goerge_IND"));
-    return data;
-  }
-
-  /**
-   * This is a helper function used in unit tests for validating data against data read using
-   * {@link EmployeeInputFormat} and {@link ReuseObjectsEmployeeInputFormat}.
-   */
-  public static List<KV<Text, Employee>> getEmployeeData() {
-    return Lists.transform((data.isEmpty() ? populateEmployeeData() : data),
-        new Function<KV<String, String>, KV<Text, Employee>>() {
-          @Override
-          public KV<Text, Employee> apply(KV<String, String> input) {
-            String[] empData = input.getValue().split("_");
-            return KV.of(new Text(input.getKey()), new Employee(empData[0], empData[1]));
-          }
-        });
-  }
-}

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/README.md
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/README.md b/sdks/java/io/hadoop/README.md
new file mode 100644
index 0000000..d91f019
--- /dev/null
+++ b/sdks/java/io/hadoop/README.md
@@ -0,0 +1,167 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
+
+# Hadoop InputFormat IO
+
+A HadoopInputFormatIO is a Transform for reading data from any source that
+implements Hadoop InputFormat, for example Cassandra, Elasticsearch, HBase, Redis, Postgres, etc.
+
+HadoopInputFormatIO has to make several performance trade-offs in connecting to InputFormat, so if there is another Beam IO Transform specifically for connecting to your data source of choice, we would recommend using that one, but this IO Transform allows you to connect to many data sources that do not yet have a Beam IO Transform.
+
+You will need to pass a Hadoop Configuration with parameters specifying how the read will occur. Many properties of the Configuration are optional, and some are required for certain InputFormat classes, but the following properties must be set for all InputFormats:
+
+* `mapreduce.job.inputformat.class`: The InputFormat class used to connect to your data source of choice.
+* `key.class`: The key class returned by the InputFormat in `mapreduce.job.inputformat.class`.
+* `value.class`: The value class returned by the InputFormat in `mapreduce.job.inputformat.class`.
+
+For example:
+```java
+Configuration myHadoopConfiguration = new Configuration(false);
+// Set Hadoop InputFormat, key and value class in configuration
+myHadoopConfiguration.setClass("mapreduce.job.inputformat.class", InputFormatClass,
+  InputFormat.class);
+myHadoopConfiguration.setClass("key.class", InputFormatKeyClass, Object.class);
+myHadoopConfiguration.setClass("value.class", InputFormatValueClass, Object.class);
+```
+
+You will need to check whether the key and value classes output by the InputFormat have a Beam Coder available. If not, you can use withKeyTranslation/withValueTranslation to specify a method transforming instances of those classes into another class that is supported by a Beam Coder. These settings are optional and you don't need to specify translation for both key and value.
+
+For example:
+```java
+SimpleFunction<InputFormatKeyClass, MyKeyClass> myOutputKeyType =
+new SimpleFunction<InputFormatKeyClass, MyKeyClass>() {
+  public MyKeyClass apply(InputFormatKeyClass input) {
+  // ...logic to transform InputFormatKeyClass to MyKeyClass
+  }
+};
+SimpleFunction<InputFormatValueClass, MyValueClass> myOutputValueType =
+new SimpleFunction<InputFormatValueClass, MyValueClass>() {
+  public MyValueClass apply(InputFormatValueClass input) {
+  // ...logic to transform InputFormatValueClass to MyValueClass
+  }
+};
+```
+
+### Reading using Hadoop InputFormat IO
+
+Read data with only a Hadoop configuration:
+
+```java
+Pipeline p = ...; // Create pipeline.
+p.apply("read",
+  HadoopInputFormatIO.<InputFormatKeyClass, InputFormatValueClass>read()
+  .withConfiguration(myHadoopConfiguration));
+```
+
+Read data with configuration and key translation (example scenario: a Beam Coder is not
+available for the key class, hence key translation is required):
+
+```java
+p.apply("read",
+  HadoopInputFormatIO.<MyKeyClass, InputFormatKeyClass>read()
+  .withConfiguration(myHadoopConfiguration)
+  .withKeyTranslation(myOutputKeyType));
+```
+
+Read data with configuration and value translation (example scenario: a Beam Coder is not
+available for the value class, hence value translation is required):
+
+```java
+p.apply("read",
+  HadoopInputFormatIO.<InputFormatKeyClass, MyValueClass>read()
+  .withConfiguration(myHadoopConfiguration)
+  .withValueTranslation(myOutputValueType));
+```
+
+Read data with configuration, key translation and value translation (example scenario: Beam Coders are not available for either the key class or the value class of the InputFormat, hence both key and value translation are required):
+
+```java
+p.apply("read",
+  HadoopInputFormatIO.<MyKeyClass, MyValueClass>read()
+  .withConfiguration(myHadoopConfiguration)
+  .withKeyTranslation(myOutputKeyType)
+  .withValueTranslation(myOutputValueType));
+```
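+
+The read produces a `PCollection<KV<...>>` of the chosen key/value types, which can then be
+processed like any other PCollection. Below is a minimal, illustrative sketch (not part of the
+IO itself) that formats each record as a String; `hifData` is a stand-in name for the
+PCollection returned by one of the reads above, using the placeholder types
+`MyKeyClass`/`MyValueClass`:
+
+```java
+PCollection<String> formatted = hifData.apply("FormatRecords",
+  MapElements.via(new SimpleFunction<KV<MyKeyClass, MyValueClass>, String>() {
+    @Override
+    public String apply(KV<MyKeyClass, MyValueClass> record) {
+      // Combine key and value into a single line; adapt to your record types.
+      return record.getKey() + "," + record.getValue();
+    }
+  }));
+```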
+
+# Examples for specific InputFormats
+
+### Cassandra - CqlInputFormat
+
+To read data from Cassandra, org.apache.cassandra.hadoop.cql3.CqlInputFormat can be used,
+which needs the following properties to be set.
+
+Create Cassandra Hadoop configuration as follows:
+
+```java
+Configuration cassandraConf = new Configuration();
+cassandraConf.set("cassandra.input.thrift.port", "9160");
+cassandraConf.set("cassandra.input.thrift.address", CassandraHostIp);
+cassandraConf.set("cassandra.input.partitioner.class", "Murmur3Partitioner");
+cassandraConf.set("cassandra.input.keyspace", "myKeySpace");
+cassandraConf.set("cassandra.input.columnfamily", "myColumnFamily");
+cassandraConf.setClass("key.class", java.lang.Long.class, Object.class);
+cassandraConf.setClass("value.class", com.datastax.driver.core.Row.class, Object.class);
+cassandraConf.setClass("mapreduce.job.inputformat.class", org.apache.cassandra.hadoop.cql3.CqlInputFormat.class, InputFormat.class);
+```
+
+Call Read transform as follows:
+
+```java
+PCollection<KV<Long, String>> cassandraData =
+  p.apply("read",
+  HadoopInputFormatIO.<Long, String>read()
+  .withConfiguration(cassandraConf)
+  .withValueTranslation(cassandraOutputValueType));
+```
+
+The CqlInputFormat key class is java.lang.Long, which has a Beam Coder. The CqlInputFormat value class is com.datastax.driver.core.Row, which does not have a Beam Coder. Rather than write a new coder, you can provide your own translation method, as follows:
+
+```java
+SimpleFunction<Row, String> cassandraOutputValueType =
+  new SimpleFunction<Row, String>() {
+    public String apply(Row row) {
+      return row.getString("myColName");
+    }
+  };
+```
+ 
+### Elasticsearch - EsInputFormat
+ 
+To read data from Elasticsearch, org.elasticsearch.hadoop.mr.EsInputFormat can be used, which needs the following properties to be set.
+ 
+Create ElasticSearch Hadoop configuration as follows:
+
+```java
+Configuration elasticSearchConf = new Configuration();
+elasticSearchConf.set("es.nodes", ElasticsearchHostIp);
+elasticSearchConf.set("es.port", "9200");
+elasticSearchConf.set("es.resource", "ElasticIndexName/ElasticTypeName");
+elasticSearchConf.setClass("key.class", org.apache.hadoop.io.Text.class, Object.class);
+elasticSearchConf.setClass("value.class", org.elasticsearch.hadoop.mr.LinkedMapWritable.class, Object.class);
+elasticSearchConf.setClass("mapreduce.job.inputformat.class", org.elasticsearch.hadoop.mr.EsInputFormat.class, InputFormat.class);
+```
+
+Call Read transform as follows:
+
+```java
+PCollection<KV<Text, LinkedMapWritable>> elasticData = p.apply("read",
+  HadoopInputFormatIO.<Text, LinkedMapWritable>read().withConfiguration(elasticSearchConf));
+```
+
+The EsInputFormat key class is org.apache.hadoop.io.Text and the value class is
+org.elasticsearch.hadoop.mr.LinkedMapWritable. Both key and value classes have Beam Coders, so no translation is required.
\ No newline at end of file
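+
+Since LinkedMapWritable is a Map-like Writable of field names to values, individual document
+fields can be extracted downstream. The following is a minimal, illustrative sketch (not part of
+the IO itself); "myFieldName" is a placeholder for a field that exists in your Elasticsearch index:
+
+```java
+PCollection<String> fieldValues = elasticData.apply("ExtractField",
+  MapElements.via(new SimpleFunction<KV<Text, LinkedMapWritable>, String>() {
+    @Override
+    public String apply(KV<Text, LinkedMapWritable> record) {
+      // Look up the placeholder field "myFieldName" in each document.
+      return String.valueOf(record.getValue().get(new Text("myFieldName")));
+    }
+  }));
+```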

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/input-format/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/input-format/pom.xml b/sdks/java/io/hadoop/input-format/pom.xml
new file mode 100644
index 0000000..9558ecd
--- /dev/null
+++ b/sdks/java/io/hadoop/input-format/pom.xml
@@ -0,0 +1,98 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.beam</groupId>
+    <artifactId>beam-sdks-java-io-hadoop-parent</artifactId>
+    <version>0.7.0-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+  <artifactId>beam-sdks-java-io-hadoop-input-format</artifactId>
+  <name>Apache Beam :: SDKs :: Java :: IO :: Hadoop :: input-format</name>
+  <description>IO to read data from data sources which implement Hadoop Input Format.</description>
+
+  <properties>
+    <log4j.core.version>2.6.2</log4j.core.version>
+    <hadoop.common.version>2.7.0</hadoop.common.version>
+    <guava.version>19.0</guava.version>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>com.google.guava</groupId>
+      <artifactId>guava</artifactId>
+      <version>${guava.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.google.code.findbugs</groupId>
+      <artifactId>jsr305</artifactId>
+    </dependency>
+
+    <!-- compile dependencies -->
+    <dependency>
+      <groupId>com.google.auto.value</groupId>
+      <artifactId>auto-value</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.beam</groupId>
+      <artifactId>beam-sdks-java-io-hadoop-common</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <version>${hadoop.common.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-core</artifactId>
+      <version>${hadoop.common.version}</version>
+      <scope>provided</scope>
+    </dependency>
+
+    <!-- test dependencies -->
+    <dependency>
+      <groupId>org.apache.beam</groupId>
+      <artifactId>beam-sdks-java-core</artifactId>
+      <classifier>tests</classifier>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.beam</groupId>
+      <artifactId>beam-runners-direct-java</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.logging.log4j</groupId>
+      <artifactId>log4j-core</artifactId>
+      <version>${log4j.core.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.hamcrest</groupId>
+      <artifactId>hamcrest-all</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+</project>
\ No newline at end of file


[3/7] beam git commit: HadoopInputFormatIO with junits

Posted by da...@apache.org.
http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIOTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIOTest.java b/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIOTest.java
new file mode 100644
index 0000000..2f2857b
--- /dev/null
+++ b/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIOTest.java
@@ -0,0 +1,797 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.beam.sdk.io.hadoop.inputformat;
+
+import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
+import static org.hamcrest.Matchers.containsInAnyOrder;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThat;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.beam.sdk.coders.AvroCoder;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.io.BoundedSource;
+import org.apache.beam.sdk.io.BoundedSource.BoundedReader;
+import org.apache.beam.sdk.io.hadoop.WritableCoder;
+import org.apache.beam.sdk.io.hadoop.inputformat.EmployeeInputFormat.EmployeeRecordReader;
+import org.apache.beam.sdk.io.hadoop.inputformat.EmployeeInputFormat.NewObjectsEmployeeInputSplit;
+import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.HadoopInputFormatBoundedSource;
+import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.SerializableConfiguration;
+import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.SerializableSplit;
+import org.apache.beam.sdk.testing.PAssert;
+import org.apache.beam.sdk.testing.SourceTestUtils;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.transforms.SimpleFunction;
+import org.apache.beam.sdk.transforms.display.DisplayData;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PBegin;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.mockito.Mockito;
+
+/**
+ * Unit tests for {@link HadoopInputFormatIO}.
+ */
+@RunWith(JUnit4.class)
+public class HadoopInputFormatIOTest {
+  static SerializableConfiguration serConf;
+  static SimpleFunction<Text, String> myKeyTranslate;
+  static SimpleFunction<Employee, String> myValueTranslate;
+
+  @Rule public final transient TestPipeline p = TestPipeline.create();
+  @Rule public ExpectedException thrown = ExpectedException.none();
+
+  private PBegin input = PBegin.in(p);
+
+  @BeforeClass
+  public static void setUp() throws IOException, InterruptedException {
+    serConf = loadTestConfiguration(
+                  EmployeeInputFormat.class,
+                  Text.class,
+                  Employee.class);
+    myKeyTranslate = new SimpleFunction<Text, String>() {
+      @Override
+      public String apply(Text input) {
+        return input.toString();
+      }
+    };
+    myValueTranslate = new SimpleFunction<Employee, String>() {
+      @Override
+      public String apply(Employee input) {
+        return input.getEmpName() + "_" + input.getEmpAddress();
+      }
+    };
+  }
+
+  @Test
+  public void testReadBuildsCorrectly() {
+    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
+        .withConfiguration(serConf.getHadoopConfiguration())
+        .withKeyTranslation(myKeyTranslate)
+        .withValueTranslation(myValueTranslate);
+    assertEquals(serConf.getHadoopConfiguration(),
+        read.getConfiguration().getHadoopConfiguration());
+    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
+    assertEquals(myValueTranslate, read.getValueTranslationFunction());
+    assertEquals(myValueTranslate.getOutputTypeDescriptor(), read.getValueTypeDescriptor());
+    assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
+  }
+
+  /**
+   * This test validates that {@link HadoopInputFormatIO.Read Read} builds correctly when
+   * configuration, key translation and value translation are set in a different order. It also
+   * validates that the output PCollection key/value classes are set correctly even if the Hadoop
+   * configuration is set after the key/value translations.
+   */
+  @Test
+  public void testReadBuildsCorrectlyInDifferentOrder() {
+    HadoopInputFormatIO.Read<String, String> read =
+        HadoopInputFormatIO.<String, String>read()
+            .withValueTranslation(myValueTranslate)
+            .withConfiguration(serConf.getHadoopConfiguration())
+            .withKeyTranslation(myKeyTranslate);
+    assertEquals(serConf.getHadoopConfiguration(),
+        read.getConfiguration().getHadoopConfiguration());
+    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
+    assertEquals(myValueTranslate, read.getValueTranslationFunction());
+    assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
+    assertEquals(myValueTranslate.getOutputTypeDescriptor(), read.getValueTypeDescriptor());
+  }
+
+  /**
+   * This test validates {@link HadoopInputFormatIO.Read Read} object creation if
+   * {@link HadoopInputFormatIO.Read#withConfiguration() withConfiguration()} is called more than
+   * once.
+   * @throws InterruptedException
+   * @throws IOException
+   */
+  @Test
+  public void testReadBuildsCorrectlyIfWithConfigurationIsCalledMoreThanOneTime()
+      throws IOException, InterruptedException {
+    SerializableConfiguration diffConf =
+        loadTestConfiguration(
+            EmployeeInputFormat.class,
+            Employee.class,
+            Text.class);
+    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
+        .withConfiguration(serConf.getHadoopConfiguration())
+        .withKeyTranslation(myKeyTranslate)
+        .withConfiguration(diffConf.getHadoopConfiguration());
+    assertEquals(diffConf.getHadoopConfiguration(),
+        read.getConfiguration().getHadoopConfiguration());
+    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
+    assertEquals(null, read.getValueTranslationFunction());
+    assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
+    assertEquals(diffConf.getHadoopConfiguration().getClass("value.class", Object.class), read
+        .getValueTypeDescriptor().getRawType());
+  }
+
+  /**
+   * This test validates functionality of {@link HadoopInputFormatIO.Read#populateDisplayData()
+   * populateDisplayData()}.
+   */
+  @Test
+  public void testReadDisplayData() {
+    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
+        .withConfiguration(serConf.getHadoopConfiguration())
+        .withKeyTranslation(myKeyTranslate)
+        .withValueTranslation(myValueTranslate);
+    DisplayData displayData = DisplayData.from(read);
+    Iterator<Entry<String, String>> propertyElement = serConf.getHadoopConfiguration().iterator();
+    while (propertyElement.hasNext()) {
+      Entry<String, String> element = propertyElement.next();
+      assertThat(displayData, hasDisplayItem(element.getKey(), element.getValue()));
+    }
+  }
+
+  /**
+   * This test validates that {@link HadoopInputFormatIO.Read Read} transform object creation fails
+   * with a null configuration. {@link HadoopInputFormatIO.Read#withConfiguration()
+   * withConfiguration()} checks whether the configuration is null and throws an exception if it is.
+   */
+  @Test
+  public void testReadObjectCreationFailsIfConfigurationIsNull() {
+    thrown.expect(NullPointerException.class);
+    HadoopInputFormatIO.<Text, Employee>read()
+          .withConfiguration(null);
+  }
+
+  /**
+   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with only
+   * configuration.
+   */
+  @Test
+  public void testReadObjectCreationWithConfiguration() {
+    HadoopInputFormatIO.Read<Text, Employee> read = HadoopInputFormatIO.<Text, Employee>read()
+        .withConfiguration(serConf.getHadoopConfiguration());
+    assertEquals(serConf.getHadoopConfiguration(),
+        read.getConfiguration().getHadoopConfiguration());
+    assertEquals(null, read.getKeyTranslationFunction());
+    assertEquals(null, read.getValueTranslationFunction());
+    assertEquals(serConf.getHadoopConfiguration().getClass("key.class", Object.class), read
+        .getKeyTypeDescriptor().getRawType());
+    assertEquals(serConf.getHadoopConfiguration().getClass("value.class", Object.class), read
+        .getValueTypeDescriptor().getRawType());
+  }
+
+  /**
+   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation fails with
+   * configuration and null key translation. {@link HadoopInputFormatIO.Read#withKeyTranslation()
+   * withKeyTranslation()} checks whether keyTranslation is null and throws an exception if a null
+   * value is passed.
+   */
+  @Test
+  public void testReadObjectCreationFailsIfKeyTranslationFunctionIsNull() {
+    thrown.expect(NullPointerException.class);
+    HadoopInputFormatIO.<String, Employee>read()
+        .withConfiguration(serConf.getHadoopConfiguration())
+        .withKeyTranslation(null);
+  }
+
+  /**
+   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with
+   * configuration and key translation.
+   */
+  @Test
+  public void testReadObjectCreationWithConfigurationKeyTranslation() {
+    HadoopInputFormatIO.Read<String, Employee> read = HadoopInputFormatIO.<String, Employee>read()
+        .withConfiguration(serConf.getHadoopConfiguration())
+        .withKeyTranslation(myKeyTranslate);
+    assertEquals(serConf.getHadoopConfiguration(),
+        read.getConfiguration().getHadoopConfiguration());
+    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
+    assertEquals(null, read.getValueTranslationFunction());
+    assertEquals(myKeyTranslate.getOutputTypeDescriptor().getRawType(),
+        read.getKeyTypeDescriptor().getRawType());
+    assertEquals(serConf.getHadoopConfiguration().getClass("value.class", Object.class),
+        read.getValueTypeDescriptor().getRawType());
+  }
+
+  /**
+   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation fails with
+   * configuration and null value translation.
+   * {@link HadoopInputFormatIO.Read#withValueTranslation() withValueTranslation()} checks
+   * whether valueTranslation is null and throws an exception if a null value is passed.
+   */
+  @Test
+  public void testReadObjectCreationFailsIfValueTranslationFunctionIsNull() {
+    thrown.expect(NullPointerException.class);
+    HadoopInputFormatIO.<Text, String>read()
+        .withConfiguration(serConf.getHadoopConfiguration())
+        .withValueTranslation(null);
+  }
+
+  /**
+   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with
+   * configuration and value translation.
+   */
+  @Test
+  public void testReadObjectCreationWithConfigurationValueTranslation() {
+    HadoopInputFormatIO.Read<Text, String> read = HadoopInputFormatIO.<Text, String>read()
+        .withConfiguration(serConf.getHadoopConfiguration())
+        .withValueTranslation(myValueTranslate);
+    assertEquals(serConf.getHadoopConfiguration(),
+        read.getConfiguration().getHadoopConfiguration());
+    assertEquals(null, read.getKeyTranslationFunction());
+    assertEquals(myValueTranslate, read.getValueTranslationFunction());
+    assertEquals(serConf.getHadoopConfiguration().getClass("key.class", Object.class),
+        read.getKeyTypeDescriptor().getRawType());
+    assertEquals(myValueTranslate.getOutputTypeDescriptor().getRawType(),
+        read.getValueTypeDescriptor().getRawType());
+  }
+
+  /**
+   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with
+   * configuration, key translation and value translation.
+   */
+  @Test
+  public void testReadObjectCreationWithConfigurationKeyTranslationValueTranslation() {
+    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
+        .withConfiguration(serConf.getHadoopConfiguration())
+        .withKeyTranslation(myKeyTranslate)
+        .withValueTranslation(myValueTranslate);
+    assertEquals(serConf.getHadoopConfiguration(),
+        read.getConfiguration().getHadoopConfiguration());
+    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
+    assertEquals(myValueTranslate, read.getValueTranslationFunction());
+    assertEquals(myKeyTranslate.getOutputTypeDescriptor().getRawType(),
+        read.getKeyTypeDescriptor().getRawType());
+    assertEquals(myValueTranslate.getOutputTypeDescriptor().getRawType(),
+        read.getValueTypeDescriptor().getRawType());
+  }
+
+  /**
+   * This test validates functionality of {@link HadoopInputFormatIO.Read#validate()
+   * Read.validate()} function when Read transform is created without calling
+   * {@link HadoopInputFormatIO.Read#withConfiguration() withConfiguration()}.
+   */
+  @Test
+  public void testReadValidationFailsMissingConfiguration() {
+    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read();
+    thrown.expect(NullPointerException.class);
+    read.validate(input);
+  }
+
+  /**
+   * This test validates functionality of {@link HadoopInputFormatIO.Read#withConfiguration()
+   * withConfiguration()} function when the Hadoop InputFormat class is not provided by the user in
+   * the configuration.
+   */
+  @Test
+  public void testReadValidationFailsMissingInputFormatInConf() {
+    Configuration configuration = new Configuration();
+    configuration.setClass("key.class", Text.class, Object.class);
+    configuration.setClass("value.class", Employee.class, Object.class);
+    thrown.expect(NullPointerException.class);
+    HadoopInputFormatIO.<Text, Employee>read()
+        .withConfiguration(configuration);
+  }
+
+  /**
+   * This test validates functionality of {@link HadoopInputFormatIO.Read#withConfiguration()
+   * withConfiguration()} function when the key class is not provided by the user in the configuration.
+   */
+  @Test
+  public void testReadValidationFailsMissingKeyClassInConf() {
+    Configuration configuration = new Configuration();
+    configuration.setClass("mapreduce.job.inputformat.class", EmployeeInputFormat.class,
+        InputFormat.class);
+    configuration.setClass("value.class", Employee.class, Object.class);
+    thrown.expect(NullPointerException.class);
+    HadoopInputFormatIO.<Text, Employee>read()
+        .withConfiguration(configuration);
+  }
+
+  /**
+   * This test validates functionality of {@link HadoopInputFormatIO.Read#withConfiguration()
+   * withConfiguration()} function when the value class is not provided by the user in the configuration.
+   */
+  @Test
+  public void testReadValidationFailsMissingValueClassInConf() {
+    Configuration configuration = new Configuration();
+    configuration.setClass("mapreduce.job.inputformat.class", EmployeeInputFormat.class,
+        InputFormat.class);
+    configuration.setClass("key.class", Text.class, Object.class);
+    thrown.expect(NullPointerException.class);
+    HadoopInputFormatIO.<Text, Employee>read().withConfiguration(configuration);
+  }
+
+  /**
+   * This test validates functionality of {@link HadoopInputFormatIO.Read#validate()
+   * Read.validate()} function when the input type of myKeyTranslate (the simple function provided
+   * by the user for key translation) is not the same as the Hadoop InputFormat's key class (the
+   * property set in the configuration as "key.class").
+   */
+  @Test
+  public void testReadValidationFailsWithWrongInputTypeKeyTranslationFunction() {
+    SimpleFunction<LongWritable, String> myKeyTranslateWithWrongInputType =
+        new SimpleFunction<LongWritable, String>() {
+          @Override
+          public String apply(LongWritable input) {
+            return input.toString();
+          }
+        };
+    HadoopInputFormatIO.Read<String, Employee> read = HadoopInputFormatIO.<String, Employee>read()
+        .withConfiguration(serConf.getHadoopConfiguration())
+        .withKeyTranslation(myKeyTranslateWithWrongInputType);
+    thrown.expect(IllegalArgumentException.class);
+    thrown.expectMessage(String.format(
+        "Key translation's input type is not same as hadoop InputFormat : %s key " + "class : %s",
+        serConf.getHadoopConfiguration().getClass("mapreduce.job.inputformat.class",
+            InputFormat.class), serConf.getHadoopConfiguration()
+            .getClass("key.class", Object.class)));
+    read.validate(input);
+  }
+
+  /**
+   * This test validates functionality of {@link HadoopInputFormatIO.Read#validate()
+   * Read.validate()} function when the input type of myValueTranslate (the simple function provided
+   * by the user for value translation) is not the same as the Hadoop InputFormat's value class (the
+   * property set in the configuration as "value.class").
+   */
+  @Test
+  public void testReadValidationFailsWithWrongInputTypeValueTranslationFunction() {
+    SimpleFunction<LongWritable, String> myValueTranslateWithWrongInputType =
+        new SimpleFunction<LongWritable, String>() {
+          @Override
+          public String apply(LongWritable input) {
+            return input.toString();
+          }
+        };
+    HadoopInputFormatIO.Read<Text, String> read =
+        HadoopInputFormatIO.<Text, String>read()
+            .withConfiguration(serConf.getHadoopConfiguration())
+            .withValueTranslation(myValueTranslateWithWrongInputType);
+    String expectedMessage =
+        String.format(
+            "Value translation's input type is not same as hadoop InputFormat :  "
+                + "%s value class : %s",
+            serConf.getHadoopConfiguration().getClass("mapreduce.job.inputformat.class",
+                InputFormat.class),
+            serConf.getHadoopConfiguration().getClass("value.class", Object.class));
+    thrown.expect(IllegalArgumentException.class);
+    thrown.expectMessage(expectedMessage);
+    read.validate(input);
+  }
+
+  @Test
+  public void testReadingData() throws Exception {
+    HadoopInputFormatIO.Read<Text, Employee> read = HadoopInputFormatIO.<Text, Employee>read()
+        .withConfiguration(serConf.getHadoopConfiguration());
+    List<KV<Text, Employee>> expected = TestEmployeeDataSet.getEmployeeData();
+    PCollection<KV<Text, Employee>> actual = p.apply("ReadTest", read);
+    PAssert.that(actual).containsInAnyOrder(expected);
+    p.run();
+  }
+
+  /**
+   * This test validates behavior of {@link HadoopInputFormatBoundedSource} if RecordReader object
+   * creation fails.
+   */
+  @Test
+  public void testReadIfCreateRecordReaderFails() throws Exception {
+    thrown.expect(Exception.class);
+    thrown.expectMessage("Exception in creating RecordReader");
+    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
+    Mockito.when(
+        mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
+            Mockito.any(TaskAttemptContext.class))).thenThrow(
+        new IOException("Exception in creating RecordReader"));
+    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
+        new HadoopInputFormatBoundedSource<Text, Employee>(
+            serConf,
+            WritableCoder.of(Text.class),
+            AvroCoder.of(Employee.class),
+            null, // No key translation required.
+            null, // No value translation required.
+            new SerializableSplit());
+    boundedSource.setInputFormatObj(mockInputFormat);
+    SourceTestUtils.readFromSource(boundedSource, p.getOptions());
+  }
+
+  /**
+   * This test validates the behavior of {@link HadoopInputFormatBoundedSource} if the InputFormat's
+   * {@link InputFormat#createRecordReader() createRecordReader()} returns null.
+   */
+  @Test
+  public void testReadWithNullCreateRecordReader() throws Exception {
+    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
+    thrown.expect(IOException.class);
+    thrown.expectMessage(String.format("Null RecordReader object returned by %s",
+            mockInputFormat.getClass()));
+    Mockito.when(
+        mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
+            Mockito.any(TaskAttemptContext.class))).thenReturn(null);
+    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
+        new HadoopInputFormatBoundedSource<Text, Employee>(
+            serConf,
+            WritableCoder.of(Text.class),
+            AvroCoder.of(Employee.class),
+            null, // No key translation required.
+            null, // No value translation required.
+            new SerializableSplit());
+    boundedSource.setInputFormatObj(mockInputFormat);
+    SourceTestUtils.readFromSource(boundedSource, p.getOptions());
+  }
+
+  /**
+   * This test validates the behavior of the
+   * {@link HadoopInputFormatBoundedSource.HadoopInputFormatReader#start() start()} method if the
+   * InputFormat's {@link InputFormat#getSplits() getSplits()} returns an InputSplit list with zero
+   * records.
+   */
+  @Test
+  public void testReadersStartWhenZeroRecords() throws Exception {
+    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
+    EmployeeRecordReader mockReader = Mockito.mock(EmployeeRecordReader.class);
+    Mockito.when(
+        mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
+            Mockito.any(TaskAttemptContext.class))).thenReturn(mockReader);
+    Mockito.when(mockReader.nextKeyValue()).thenReturn(false);
+    InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
+    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
+        new HadoopInputFormatBoundedSource<Text, Employee>(
+            serConf,
+            WritableCoder.of(Text.class),
+            AvroCoder.of(Employee.class),
+            null, // No key translation required.
+            null, // No value translation required.
+            new SerializableSplit(mockInputSplit));
+    BoundedReader<KV<Text, Employee>> boundedReader = boundedSource.createReader(p.getOptions());
+    assertEquals(false, boundedReader.start());
+    assertEquals(Double.valueOf(1), boundedReader.getFractionConsumed());
+  }
+
+  /**
+   * This test validates the method getFractionConsumed(), which indicates the progress of the read
+   * in the range 0 to 1.
+   */
+  @Test
+  public void testReadersGetFractionConsumed() throws Exception {
+    List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
+    HadoopInputFormatBoundedSource<Text, Employee> hifSource = getTestHIFSource(
+        EmployeeInputFormat.class,
+        Text.class,
+        Employee.class,
+        WritableCoder.of(Text.class),
+        AvroCoder.of(Employee.class));
+    long estimatedSize = hifSource.getEstimatedSizeBytes(p.getOptions());
+    // Validate if estimated size is equal to the size of records.
+    assertEquals(referenceRecords.size(), estimatedSize);
+    List<BoundedSource<KV<Text, Employee>>> boundedSourceList =
+        hifSource.splitIntoBundles(0, p.getOptions());
+    // Validate if splitIntoBundles() has split correctly.
+    assertEquals(TestEmployeeDataSet.NUMBER_OF_SPLITS, boundedSourceList.size());
+    List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
+    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
+      List<KV<Text, Employee>> elements = new ArrayList<KV<Text, Employee>>();
+      BoundedReader<KV<Text, Employee>> reader = source.createReader(p.getOptions());
+      float recordsRead = 0;
+      // When start is not called, getFractionConsumed() should return 0.
+      assertEquals(Double.valueOf(0), reader.getFractionConsumed());
+      boolean start = reader.start();
+      assertEquals(true, start);
+      if (start) {
+        elements.add(reader.getCurrent());
+        boolean advance = reader.advance();
+        // Validate if getFractionConsumed() returns the correct fraction based on
+        // the number of records read in the split.
+        assertEquals(
+            Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
+            reader.getFractionConsumed());
+        assertEquals(true, advance);
+        while (advance) {
+          elements.add(reader.getCurrent());
+          advance = reader.advance();
+          assertEquals(
+              Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
+              reader.getFractionConsumed());
+        }
+        bundleRecords.addAll(elements);
+      }
+      // Validate if getFractionConsumed() returns 1 after reading is complete.
+      assertEquals(Double.valueOf(1), reader.getFractionConsumed());
+      reader.close();
+    }
+    assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
+  }
+
+  /**
+   * This test validates that the reader and its parent source read the same records.
+   */
+  @Test
+  public void testReaderAndParentSourceReadsSameData() throws Exception {
+    InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
+    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
+        new HadoopInputFormatBoundedSource<Text, Employee>(
+            serConf,
+            WritableCoder.of(Text.class),
+            AvroCoder.of(Employee.class),
+            null, // No key translation required.
+            null, // No value translation required.
+            new SerializableSplit(mockInputSplit));
+    BoundedReader<KV<Text, Employee>> reader = boundedSource
+        .createReader(p.getOptions());
+    SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource(reader, p.getOptions());
+  }
+
+  /**
+   * This test verifies that the method
+   * {@link HadoopInputFormatBoundedSource.HadoopInputFormatReader#getCurrentSource()
+   * getCurrentSource()} returns correct source object.
+   */
+  @Test
+  public void testGetCurrentSourceFunction() throws Exception {
+    SerializableSplit split = new SerializableSplit();
+    BoundedSource<KV<Text, Employee>> source =
+        new HadoopInputFormatBoundedSource<Text, Employee>(
+            serConf,
+            WritableCoder.of(Text.class),
+            AvroCoder.of(Employee.class),
+            null, // No key translation required.
+            null, // No value translation required.
+            split);
+    BoundedReader<KV<Text, Employee>> hifReader = source.createReader(p.getOptions());
+    BoundedSource<KV<Text, Employee>> hifSource = hifReader.getCurrentSource();
+    assertEquals(hifSource, source);
+  }
+
+  /**
+   * This test validates behavior of {@link HadoopInputFormatBoundedSource#createReader()
+   * createReader()} method when {@link HadoopInputFormatBoundedSource#splitIntoBundles()
+   * splitIntoBundles()} is not called.
+   */
+  @Test
+  public void testCreateReaderIfSplitIntoBundlesNotCalled() throws Exception {
+    HadoopInputFormatBoundedSource<Text, Employee> hifSource = getTestHIFSource(
+        EmployeeInputFormat.class,
+        Text.class,
+        Employee.class,
+        WritableCoder.of(Text.class),
+        AvroCoder.of(Employee.class));
+    thrown.expect(IOException.class);
+    thrown.expectMessage("Cannot create reader as source is not split yet.");
+    hifSource.createReader(p.getOptions());
+  }
+
+  /**
+   * This test validates behavior of
+   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when Hadoop
+   * InputFormat's {@link InputFormat#getSplits() getSplits()} returns empty list.
+   */
+  @Test
+  public void testComputeSplitsIfGetSplitsReturnsEmptyList() throws Exception {
+    InputFormat<?, ?> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
+    SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
+    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(
+        new ArrayList<InputSplit>());
+    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
+        new HadoopInputFormatBoundedSource<Text, Employee>(
+            serConf,
+            WritableCoder.of(Text.class),
+            AvroCoder.of(Employee.class),
+            null, // No key translation required.
+            null, // No value translation required.
+            mockInputSplit);
+    thrown.expect(IOException.class);
+    thrown.expectMessage("Error in computing splits, getSplits() returns a empty list");
+    hifSource.setInputFormatObj(mockInputFormat);
+    hifSource.computeSplitsIfNecessary();
+  }
+
+  /**
+   * This test validates behavior of
+   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when Hadoop
+   * InputFormat's {@link InputFormat#getSplits() getSplits()} returns NULL value.
+   */
+  @Test
+  public void testComputeSplitsIfGetSplitsReturnsNullValue() throws Exception {
+    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
+    SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
+    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(null);
+    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
+        new HadoopInputFormatBoundedSource<Text, Employee>(
+            serConf,
+            WritableCoder.of(Text.class),
+            AvroCoder.of(Employee.class),
+            null, // No key translation required.
+            null, // No value translation required.
+            mockInputSplit);
+    thrown.expect(IOException.class);
+    thrown.expectMessage("Error in computing splits, getSplits() returns null.");
+    hifSource.setInputFormatObj(mockInputFormat);
+    hifSource.computeSplitsIfNecessary();
+  }
+
+  /**
+   * This test validates behavior of
+   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} if Hadoop
+   * InputFormat's {@link InputFormat#getSplits() getSplits()} returns InputSplit list having some
+   * null values.
+   */
+  @Test
+  public void testComputeSplitsIfGetSplitsReturnsListHavingNullValues() throws Exception {
+    // InputSplit list having null value.
+    InputSplit mockInputSplit =
+        Mockito.mock(InputSplit.class, Mockito.withSettings().extraInterfaces(Writable.class));
+    List<InputSplit> inputSplitList = new ArrayList<InputSplit>();
+    inputSplitList.add(mockInputSplit);
+    inputSplitList.add(null);
+    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
+    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(
+        inputSplitList);
+    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
+        new HadoopInputFormatBoundedSource<Text, Employee>(
+            serConf,
+            WritableCoder.of(Text.class),
+            AvroCoder.of(Employee.class),
+            null, // No key translation required.
+            null, // No value translation required.
+            new SerializableSplit());
+    thrown.expect(IOException.class);
+    thrown.expectMessage("Error in computing splits, split is null in InputSplits list populated "
+        + "by getSplits() : ");
+    hifSource.setInputFormatObj(mockInputFormat);
+    hifSource.computeSplitsIfNecessary();
+  }
+
+  /**
+   * This test validates that records emitted in the PCollection are immutable if the InputFormat's
+   * RecordReader returns the same objects (i.e. the same locations in memory) but with updated
+   * values for each record.
+   */
+  @Test
+  public void testImmutablityOfOutputOfReadIfRecordReaderObjectsAreMutable() throws Exception {
+    List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
+       ReuseObjectsEmployeeInputFormat.class,
+       Text.class,
+       Employee.class,
+       WritableCoder.of(Text.class),
+       AvroCoder.of(Employee.class));
+    List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
+    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
+      List<KV<Text, Employee>> elems = SourceTestUtils.readFromSource(source, p.getOptions());
+      bundleRecords.addAll(elems);
+    }
+    List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
+    assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
+  }
+
+  /**
+   * Test reading if InputFormat implements {@link org.apache.hadoop.conf.Configurable
+   * Configurable}.
+   */
+  @Test
+  public void testReadingWithConfigurableInputFormat() throws Exception {
+    List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
+        ConfigurableEmployeeInputFormat.class,
+        Text.class,
+        Employee.class,
+        WritableCoder.of(Text.class),
+        AvroCoder.of(Employee.class));
+    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
+      // Cast to HadoopInputFormatBoundedSource to access getInputFormat().
+      @SuppressWarnings("unchecked")
+      HadoopInputFormatBoundedSource<Text, Employee> hifSource =
+          (HadoopInputFormatBoundedSource<Text, Employee>) source;
+      hifSource.createInputFormatInstance();
+      ConfigurableEmployeeInputFormat inputFormatObj =
+          (ConfigurableEmployeeInputFormat) hifSource.getInputFormat();
+      assertEquals(true, inputFormatObj.isConfSet);
+    }
+  }
+
+  /**
+   * This test validates that records emitted in the PCollection are immutable if the InputFormat's
+   * {@link org.apache.hadoop.mapreduce.RecordReader RecordReader} returns different objects (i.e.
+   * different locations in memory).
+   */
+  @Test
+  public void testImmutablityOfOutputOfReadIfRecordReaderObjectsAreImmutable() throws Exception {
+   List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
+       EmployeeInputFormat.class,
+       Text.class,
+       Employee.class,
+       WritableCoder.of(Text.class),
+       AvroCoder.of(Employee.class));
+    List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
+    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
+      List<KV<Text, Employee>> elems = SourceTestUtils.readFromSource(source, p.getOptions());
+      bundleRecords.addAll(elems);
+    }
+    List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
+    assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
+  }
+
+  private static SerializableConfiguration loadTestConfiguration(Class<?> inputFormatClassName,
+      Class<?> keyClass, Class<?> valueClass) {
+    Configuration conf = new Configuration();
+    conf.setClass("mapreduce.job.inputformat.class", inputFormatClassName, InputFormat.class);
+    conf.setClass("key.class", keyClass, Object.class);
+    conf.setClass("value.class", valueClass, Object.class);
+    return new SerializableConfiguration(conf);
+  }
+
+  private <K, V> HadoopInputFormatBoundedSource<K, V> getTestHIFSource(
+      Class<?> inputFormatClass,
+      Class<K> inputFormatKeyClass,
+      Class<V> inputFormatValueClass,
+      Coder<K> keyCoder,
+      Coder<V> valueCoder){
+    SerializableConfiguration serConf =
+        loadTestConfiguration(
+            inputFormatClass,
+            inputFormatKeyClass,
+            inputFormatValueClass);
+    return new HadoopInputFormatBoundedSource<K, V>(
+            serConf,
+            keyCoder,
+            valueCoder,
+            null, // No key translation required.
+            null); // No value translation required.
+  }
+
+  private <K, V> List<BoundedSource<KV<K, V>>> getBoundedSourceList(
+      Class<?> inputFormatClass,
+      Class<K> inputFormatKeyClass,
+      Class<V> inputFormatValueClass,
+      Coder<K> keyCoder,
+      Coder<V> valueCoder) throws Exception{
+    HadoopInputFormatBoundedSource<K, V> boundedSource = getTestHIFSource(
+        inputFormatClass,
+        inputFormatKeyClass,
+        inputFormatValueClass,
+        keyCoder,
+        valueCoder);
+    return boundedSource.splitIntoBundles(0, p.getOptions());
+  }
+}

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ReuseObjectsEmployeeInputFormat.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ReuseObjectsEmployeeInputFormat.java b/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ReuseObjectsEmployeeInputFormat.java
new file mode 100644
index 0000000..fbe74ec
--- /dev/null
+++ b/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ReuseObjectsEmployeeInputFormat.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.beam.sdk.io.hadoop.inputformat;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.beam.sdk.values.KV;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+/**
+ * This is a valid InputFormat for reading employee data, available in the form of
+ * {@code List<KV>} as {@linkplain ReuseObjectsEmployeeRecordReader#employeeDataList
+ * employeeDataList}. {@linkplain ReuseObjectsEmployeeRecordReader#employeeDataList
+ * employeeDataList} is populated using {@linkplain TestEmployeeDataSet#populateEmployeeData()}.
+ *
+ * <p>{@linkplain ReuseObjectsEmployeeInputFormat} splits the data into
+ * {@value TestEmployeeDataSet#NUMBER_OF_SPLITS} splits, each containing
+ * {@value TestEmployeeDataSet#NUMBER_OF_RECORDS_IN_EACH_SPLIT} records. It reads data from
+ * {@linkplain ReuseObjectsEmployeeRecordReader#employeeDataList employeeDataList} and produces a
+ * key (employee id) of type Text and a value of type {@linkplain Employee Employee}.
+ *
+ * <p>{@linkplain ReuseObjectsEmployeeInputFormat} is also used to test whether the
+ * {@linkplain HadoopInputFormatIO} source returns immutable records when the RecordReader reuses
+ * the same key and value objects, updating their contents on every read.
+ */
+public class ReuseObjectsEmployeeInputFormat extends InputFormat<Text, Employee> {
+
+  public ReuseObjectsEmployeeInputFormat() {}
+
+  @Override
+  public RecordReader<Text, Employee> createRecordReader(InputSplit split,
+      TaskAttemptContext context) throws IOException, InterruptedException {
+    return new ReuseObjectsEmployeeRecordReader();
+  }
+
+  @Override
+  public List<InputSplit> getSplits(JobContext arg0) throws IOException, InterruptedException {
+    List<InputSplit> inputSplitList = new ArrayList<InputSplit>();
+    for (int i = 1; i <= TestEmployeeDataSet.NUMBER_OF_SPLITS; i++) {
+      InputSplit inputSplitObj = new ReuseEmployeeInputSplit(
+          ((i - 1) * TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
+          (i * TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT - 1));
+      inputSplitList.add(inputSplitObj);
+    }
+    return inputSplitList;
+  }
+
+  /**
+   * InputSplit implementation for ReuseObjectsEmployeeInputFormat.
+   */
+  public class ReuseEmployeeInputSplit extends InputSplit implements Writable {
+    // Start and end index of this split within the employee data list.
+    private long startIndex;
+    private long endIndex;
+
+    public ReuseEmployeeInputSplit() {}
+
+    public ReuseEmployeeInputSplit(long startIndex, long endIndex) {
+      this.startIndex = startIndex;
+      this.endIndex = endIndex;
+    }
+
+    /** Returns the number of records in this split. */
+    @Override
+    public long getLength() throws IOException, InterruptedException {
+      return this.endIndex - this.startIndex + 1;
+    }
+
+    @Override
+    public String[] getLocations() throws IOException, InterruptedException {
+      return null;
+    }
+
+
+    public long getStartIndex() {
+      return startIndex;
+    }
+
+    public long getEndIndex() {
+      return endIndex;
+    }
+
+    @Override
+    public void readFields(DataInput dataIn) throws IOException {
+      startIndex = dataIn.readLong();
+      endIndex = dataIn.readLong();
+    }
+
+    @Override
+    public void write(DataOutput dataOut) throws IOException {
+      dataOut.writeLong(startIndex);
+      dataOut.writeLong(endIndex);
+    }
+  }
+
+  /**
+   * RecordReader for ReuseObjectsEmployeeInputFormat.
+   */
+  public class ReuseObjectsEmployeeRecordReader extends RecordReader<Text, Employee> {
+
+    private ReuseEmployeeInputSplit split;
+    private Text currentKey = new Text();
+    private Employee currentValue = new Employee();
+    private long employeeListIndex = 0L;
+    private long recordsRead = 0L;
+    private List<KV<String, String>> employeeDataList;
+
+    public ReuseObjectsEmployeeRecordReader() {}
+
+    @Override
+    public void close() throws IOException {}
+
+    @Override
+    public Text getCurrentKey() throws IOException, InterruptedException {
+      return currentKey;
+    }
+
+    @Override
+    public Employee getCurrentValue() throws IOException, InterruptedException {
+      return currentValue;
+    }
+
+    @Override
+    public float getProgress() throws IOException, InterruptedException {
+      return (float) recordsRead / split.getLength();
+    }
+
+    @Override
+    public void initialize(InputSplit split, TaskAttemptContext arg1)
+        throws IOException, InterruptedException {
+      this.split = (ReuseEmployeeInputSplit) split;
+      employeeListIndex = this.split.getStartIndex() - 1;
+      recordsRead = 0;
+      employeeDataList = TestEmployeeDataSet.populateEmployeeData();
+    }
+
+    @Override
+    public boolean nextKeyValue() throws IOException, InterruptedException {
+      if ((recordsRead++) >= split.getLength()) {
+        return false;
+      }
+      employeeListIndex++;
+      KV<String, String> employeeDetails = employeeDataList.get((int) employeeListIndex);
+      String empData[] = employeeDetails.getValue().split("_");
+      // Updating the same key and value objects with new employee data.
+      currentKey.set(employeeDetails.getKey());
+      currentValue.setEmpName(empData[0]);
+      currentValue.setEmpAddress(empData[1]);
+      return true;
+    }
+  }
+}
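For reference, a minimal sketch of how a test could wire ReuseObjectsEmployeeInputFormat into HadoopInputFormatIO, mirroring the configuration keys set by loadTestConfiguration in the test class above; the pipeline object and coder inference are assumed from the surrounding test setup:

```java
// Sketch only: same configuration keys as loadTestConfiguration uses above.
Configuration conf = new Configuration();
conf.setClass("mapreduce.job.inputformat.class",
    ReuseObjectsEmployeeInputFormat.class, InputFormat.class);
conf.setClass("key.class", Text.class, Object.class);
conf.setClass("value.class", Employee.class, Object.class);
// Records produced by HadoopInputFormatIO should stay immutable even though this
// RecordReader reuses its key and value objects.
PCollection<KV<Text, Employee>> employees = pipeline.apply(
    HadoopInputFormatIO.<Text, Employee>read().withConfiguration(conf));
```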

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/TestEmployeeDataSet.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/TestEmployeeDataSet.java b/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/TestEmployeeDataSet.java
new file mode 100644
index 0000000..4a8fe95
--- /dev/null
+++ b/sdks/java/io/hadoop/input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/TestEmployeeDataSet.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.beam.sdk.io.hadoop.inputformat;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Lists;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.beam.sdk.values.KV;
+import org.apache.hadoop.io.Text;
+
+/**
+ * Test Utils used in {@link EmployeeInputFormat} and {@link ReuseObjectsEmployeeInputFormat} for
+ * computing splits.
+ */
+public class TestEmployeeDataSet {
+  public static final long NUMBER_OF_RECORDS_IN_EACH_SPLIT = 5L;
+  public static final long NUMBER_OF_SPLITS = 3L;
+  private static final List<KV<String, String>> data = new ArrayList<KV<String, String>>();
+
+  /**
+   * Returns a list of employee details in the form of {@link KV}, in which the key is the
+   * employee id and the value holds the employee name and address separated by '_'. This is the
+   * input data for {@link EmployeeInputFormat} and {@link ReuseObjectsEmployeeInputFormat}.
+   */
+  public static List<KV<String, String>> populateEmployeeData() {
+    if (!data.isEmpty()) {
+      return data;
+    }
+    data.add(KV.of("0", "Alex_US"));
+    data.add(KV.of("1", "John_UK"));
+    data.add(KV.of("2", "Tom_UK"));
+    data.add(KV.of("3", "Nick_UAE"));
+    data.add(KV.of("4", "Smith_IND"));
+    data.add(KV.of("5", "Taylor_US"));
+    data.add(KV.of("6", "Gray_UK"));
+    data.add(KV.of("7", "James_UAE"));
+    data.add(KV.of("8", "Jordan_IND"));
+    data.add(KV.of("9", "Leena_UK"));
+    data.add(KV.of("10", "Zara_UAE"));
+    data.add(KV.of("11", "Talia_IND"));
+    data.add(KV.of("12", "Rose_UK"));
+    data.add(KV.of("13", "Kelvin_UAE"));
+    data.add(KV.of("14", "George_IND"));
+    return data;
+  }
+
+  /**
+   * Returns the expected key/value pairs, used in unit tests to validate the data read with
+   * {@link EmployeeInputFormat} and {@link ReuseObjectsEmployeeInputFormat}.
+   */
+  public static List<KV<Text, Employee>> getEmployeeData() {
+    return Lists.transform((data.isEmpty() ? populateEmployeeData() : data),
+        new Function<KV<String, String>, KV<Text, Employee>>() {
+          @Override
+          public KV<Text, Employee> apply(KV<String, String> input) {
+            String[] empData = input.getValue().split("_");
+            return KV.of(new Text(input.getKey()), new Employee(empData[0], empData[1]));
+          }
+        });
+  }
+}
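As noted in the javadoc of getEmployeeData(), the helper is meant for validating test output; a short sketch of that usage, assuming a PCollection<KV<Text, Employee>> named actualEmployees read through HadoopInputFormatIO within a TestPipeline:

```java
// Sketch: compare the records read via HadoopInputFormatIO with the reference data set.
PAssert.that(actualEmployees).containsInAnyOrder(TestEmployeeDataSet.getEmployeeData());
pipeline.run().waitUntilFinish();
```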

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/pom.xml b/sdks/java/io/hadoop/jdk1.8-tests/pom.xml
new file mode 100644
index 0000000..4c510ae
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/pom.xml
@@ -0,0 +1,278 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<!-- The HifIO tests for Cassandra and Elasticsearch work only with JDK 1.8,
+  but Beam's enforcer rules require both JDK 1.7 and JDK 1.8 support. This
+  child module contains only those tests and overrides the enforcer rules to
+  allow JDK 1.8-only behavior without forcing all of HifIO to require JDK 1.8. -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.beam</groupId>
+    <artifactId>beam-sdks-java-io-hadoop-parent</artifactId>
+    <version>0.7.0-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+  <artifactId>beam-sdks-java-io-hadoop-jdk1.8-tests</artifactId>
+  <name>Apache Beam :: SDKs :: Java :: IO :: Hadoop :: jdk1.8-tests</name>
+  <description>Integration tests and unit tests which need JDK 1.8.</description>
+
+  <build>
+    <plugins>
+      <plugin>
+       <!-- Guava shading is required because the Cassandra tests need Guava 19;
+       the project-wide Guava shading may not suffice, as it loads a different
+       Guava version that does not work for the Cassandra tests. -->
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <artifactSet>
+                <includes>
+                  <include>com.google.guava:guava:19.0</include>
+                </includes>
+              </artifactSet>
+              <relocations>
+                <relocation>
+                  <pattern>com.google.common</pattern>
+                  <shadedPattern>org.apache.beam.sdk.io.hadoop.jdk1.8-tests.repackaged.com.google.common</shadedPattern>
+                </relocation>
+               <relocation>
+                 <pattern>com.google.thirdparty</pattern>
+                 <shadedPattern>org.apache.beam.sdk.io.hadoop.jdk1.8-tests.repackaged.com.google.thirdparty</shadedPattern>
+                 </relocation>
+               </relocations>
+               <transformers>
+                 <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
+               </transformers>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <!-- Overridden enforcer plugin for JDK1.8 for running tests -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-enforcer-plugin</artifactId>
+        <version>1.4.1</version>
+        <executions>
+          <execution>
+            <id>enforce</id>
+            <goals>
+              <goal>enforce</goal>
+            </goals>
+            <configuration>
+              <rules>
+                <enforceBytecodeVersion>
+                  <maxJdkVersion>1.8</maxJdkVersion>
+                  <excludes>
+                    <!-- Supplied by the user JDK and compiled with matching
+                      version. Is not shaded, so safe to ignore. -->
+                    <exclude>jdk.tools:jdk.tools</exclude>
+                  </excludes>
+                </enforceBytecodeVersion>
+                <requireJavaVersion>
+                  <version>[1.8,)</version>
+                </requireJavaVersion>
+              </rules>
+            </configuration>
+          </execution>
+        </executions>
+        <dependencies>
+          <dependency>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>extra-enforcer-rules</artifactId>
+            <version>1.0-beta-6</version>
+          </dependency>
+        </dependencies>
+      </plugin>
+    </plugins>
+  </build>
+  <!--The dataflow-runner and spark-runner profiles support using those runners
+    during an integration test. These are not the long-term way we want to support
+    using runners in ITs (e.g. it is annoying to add to all IO modules.) We cannot
+    create a dependency IO -> Runners since the runners depend on IO (e.g. kafka
+    depends on spark.) -->
+
+  <profiles>
+    <!-- Include the Apache Spark runner -P spark-runner -->
+    <profile>
+      <id>spark-runner</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.beam</groupId>
+          <artifactId>beam-runners-spark</artifactId>
+          <scope>runtime</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.spark</groupId>
+          <artifactId>spark-streaming_2.10</artifactId>
+          <version>${spark.version}</version>
+          <scope>runtime</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.spark</groupId>
+          <artifactId>spark-core_2.10</artifactId>
+          <version>${spark.version}</version>
+          <scope>runtime</scope>
+          <exclusions>
+            <exclusion>
+              <groupId>org.slf4j</groupId>
+              <artifactId>jul-to-slf4j</artifactId>
+            </exclusion>
+          </exclusions>
+        </dependency>
+      </dependencies>
+    </profile>
+
+    <!-- Include the Google Cloud Dataflow runner -P dataflow-runner -->
+    <profile>
+      <id>dataflow-runner</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.beam</groupId>
+          <artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
+          <scope>runtime</scope>
+        </dependency>
+      </dependencies>
+    </profile>
+  </profiles>
+
+  <properties>
+    <log4j.core.version>2.6.2</log4j.core.version>
+    <hadoop.common.version>2.7.0</hadoop.common.version>
+    <guava.version>19.0</guava.version>
+    <transport.netty4.client.version>5.0.0</transport.netty4.client.version>
+    <netty.transport.native.epoll.version>4.1.0.CR3</netty.transport.native.epoll.version>
+    <elasticsearch.version>5.0.0</elasticsearch.version>
+    <cassandra.driver.mapping.version>3.1.1</cassandra.driver.mapping.version>
+    <cassandra.all.version>3.9</cassandra.all.version>
+    <cassandra.driver.core.version>3.1.1</cassandra.driver.core.version>
+    <commons.io.version>2.4</commons.io.version>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.beam</groupId>
+      <artifactId>beam-sdks-java-io-hadoop-input-format</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.google.guava</groupId>
+      <artifactId>guava</artifactId>
+      <version>${guava.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+    </dependency>
+
+    <!-- compile dependencies -->
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <version>${hadoop.common.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-core</artifactId>
+      <version>${hadoop.common.version}</version>
+      <scope>provided</scope>
+    </dependency>
+
+    <!-- test dependencies -->
+    <dependency>
+      <groupId>org.apache.beam</groupId>
+      <artifactId>beam-runners-direct-java</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.logging.log4j</groupId>
+      <artifactId>log4j-core</artifactId>
+      <version>${log4j.core.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.elasticsearch.plugin</groupId>
+      <artifactId>transport-netty4-client</artifactId>
+      <version>${transport.netty4.client.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.elasticsearch.client</groupId>
+      <artifactId>transport</artifactId>
+      <version>${elasticsearch.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>io.netty</groupId>
+      <artifactId>netty-transport-native-epoll</artifactId>
+      <version>${netty.transport.native.epoll.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.elasticsearch</groupId>
+      <artifactId>elasticsearch</artifactId>
+      <version>${elasticsearch.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.elasticsearch</groupId>
+      <artifactId>elasticsearch-hadoop</artifactId>
+      <version>${elasticsearch.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.datastax.cassandra</groupId>
+      <artifactId>cassandra-driver-mapping</artifactId>
+      <version>${cassandra.driver.mapping.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.cassandra</groupId>
+      <artifactId>cassandra-all</artifactId>
+      <version>${cassandra.all.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.hamcrest</groupId>
+      <artifactId>hamcrest-all</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.datastax.cassandra</groupId>
+      <artifactId>cassandra-driver-core</artifactId>
+      <version>${cassandra.driver.core.version}</version>
+      <scope>test</scope>
+    </dependency>
+
+    <!-- runtime dependencies -->
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+      <scope>runtime</scope>
+    </dependency>
+  </dependencies>
+
+</project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/HIFIOWithElasticTest.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/HIFIOWithElasticTest.java b/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/HIFIOWithElasticTest.java
new file mode 100644
index 0000000..599a4a1
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/HIFIOWithElasticTest.java
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.hadoop.inputformat;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.beam.sdk.io.hadoop.inputformat.hashing.HashingFn;
+import org.apache.beam.sdk.testing.PAssert;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.beam.sdk.transforms.Count;
+import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.SimpleFunction;
+import org.apache.beam.sdk.transforms.Values;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.elasticsearch.action.admin.indices.create.CreateIndexRequest;
+import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.hadoop.cfg.ConfigurationOptions;
+import org.elasticsearch.hadoop.mr.EsInputFormat;
+import org.elasticsearch.hadoop.mr.LinkedMapWritable;
+import org.elasticsearch.node.Node;
+import org.elasticsearch.node.NodeValidationException;
+import org.elasticsearch.node.internal.InternalSettingsPreparer;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.transport.Netty4Plugin;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Tests to validate HadoopInputFormatIO for an embedded Elasticsearch instance.
+ *
+ * <p>{@link EsInputFormat} can be used to read data from Elasticsearch. By default, EsInputFormat
+ * returns Text as the key class and LinkedMapWritable as the value class. You can also use
+ * MapWritable as the value class, provided that you set the property
+ * "mapred.mapoutput.value.class" to MapWritable.class. If this property is not set, using
+ * MapWritable as the value class may cause an org.apache.beam.sdk.coders.CoderException due to
+ * unexpected extra bytes after decoding.
+ */
+@RunWith(JUnit4.class)
+public class HIFIOWithElasticTest implements Serializable {
+
+  private static final long serialVersionUID = 1L;
+  private static final Logger LOGGER = LoggerFactory.getLogger(HIFIOWithElasticTest.class);
+  private static final String ELASTIC_IN_MEM_HOSTNAME = "127.0.0.1";
+  private static final String ELASTIC_IN_MEM_PORT = "9200";
+  private static final String ELASTIC_INTERNAL_VERSION = "5.x";
+  private static final String TRUE = "true";
+  private static final String ELASTIC_INDEX_NAME = "beamdb";
+  private static final String ELASTIC_TYPE_NAME = "scientists";
+  private static final String ELASTIC_RESOURCE = "/" + ELASTIC_INDEX_NAME + "/" + ELASTIC_TYPE_NAME;
+  private static final int TEST_DATA_ROW_COUNT = 10;
+  private static final String ELASTIC_TYPE_ID_PREFIX = "s";
+
+  @ClassRule
+  public static TemporaryFolder elasticTempFolder = new TemporaryFolder();
+
+  @Rule
+  public final transient TestPipeline pipeline = TestPipeline.create();
+
+  @BeforeClass
+  public static void startServer()
+      throws NodeValidationException, InterruptedException, IOException {
+    ElasticEmbeddedServer.startElasticEmbeddedServer();
+  }
+
+  /**
+   * Test to read data from embedded Elasticsearch instance and verify whether data is read
+   * successfully.
+   */
+  @Test
+  public void testHifIOWithElastic() {
+    // The expected hashcode is computed once at insertion time and hardcoded here.
+    String expectedHashCode = "e2098f431f90193aa4545e033e6fd2217aafe7b6";
+    Configuration conf = getConfiguration();
+    PCollection<KV<Text, LinkedMapWritable>> esData =
+        pipeline.apply(HadoopInputFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
+    PCollection<Long> count = esData.apply(Count.<KV<Text, LinkedMapWritable>>globally());
+    // Verify that the count of objects fetched using HIFInputFormat IO is correct.
+    PAssert.thatSingleton(count).isEqualTo((long) TEST_DATA_ROW_COUNT);
+    PCollection<LinkedMapWritable> values = esData.apply(Values.<LinkedMapWritable>create());
+    PCollection<String> textValues = values.apply(transformFunc);
+    // Verify the output values using checksum comparison.
+    PCollection<String> consolidatedHashcode =
+        textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
+    PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
+    pipeline.run().waitUntilFinish();
+  }
+
+  MapElements<LinkedMapWritable, String> transformFunc =
+      MapElements.<LinkedMapWritable, String>via(new SimpleFunction<LinkedMapWritable, String>() {
+        @Override
+        public String apply(LinkedMapWritable mapw) {
+          return mapw.get(new Text("id")) + "|" + mapw.get(new Text("scientist"));
+        }
+      });
+
+  /**
+   * Test to read data from embedded Elasticsearch instance based on query and verify whether data
+   * is read successfully.
+   */
+  @Test
+  public void testHifIOWithElasticQuery() {
+    long expectedRowCount = 1L;
+    String expectedHashCode = "caa37dbd8258e3a7f98932958c819a57aab044ec";
+    Configuration conf = getConfiguration();
+    String fieldValue = ELASTIC_TYPE_ID_PREFIX + "2";
+    String query = "{"
+                  + "  \"query\": {"
+                  + "  \"match\" : {"
+                  + "    \"id\" : {"
+                  + "      \"query\" : \"" + fieldValue + "\","
+                  + "      \"type\" : \"boolean\""
+                  + "    }"
+                  + "  }"
+                  + "  }"
+                  + "}";
+    conf.set(ConfigurationOptions.ES_QUERY, query);
+    PCollection<KV<Text, LinkedMapWritable>> esData =
+        pipeline.apply(HadoopInputFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
+    PCollection<Long> count = esData.apply(Count.<KV<Text, LinkedMapWritable>>globally());
+    // Verify that the count of objects fetched using HIFInputFormat IO is correct.
+    PAssert.thatSingleton(count).isEqualTo(expectedRowCount);
+    PCollection<LinkedMapWritable> values = esData.apply(Values.<LinkedMapWritable>create());
+    PCollection<String> textValues = values.apply(transformFunc);
+    // Verify the output values using checksum comparison.
+    PCollection<String> consolidatedHashcode =
+        textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
+    PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
+    pipeline.run().waitUntilFinish();
+  }
+
+  /**
+   * Returns a Hadoop Configuration populated with the Elasticsearch parameters. The Configuration
+   * must have the InputFormat class, key class and value class set. The mandatory fields for
+   * EsInputFormat are es.resource, es.nodes, es.port and es.internal.es.version.
+   * Please refer to
+   * <a href="https://www.elastic.co/guide/en/elasticsearch/hadoop/current/configuration.html"
+   * >Elasticsearch Configuration</a> for more details.
+   */
+  public Configuration getConfiguration() {
+    Configuration conf = new Configuration();
+    conf.set(ConfigurationOptions.ES_NODES, ELASTIC_IN_MEM_HOSTNAME);
+    conf.set(ConfigurationOptions.ES_PORT, ELASTIC_IN_MEM_PORT);
+    conf.set(ConfigurationOptions.ES_RESOURCE, ELASTIC_RESOURCE);
+    conf.set("es.internal.es.version", ELASTIC_INTERNAL_VERSION);
+    conf.set(ConfigurationOptions.ES_NODES_DISCOVERY, TRUE);
+    conf.set(ConfigurationOptions.ES_INDEX_AUTO_CREATE, TRUE);
+    conf.setClass("mapreduce.job.inputformat.class",
+        org.elasticsearch.hadoop.mr.EsInputFormat.class, InputFormat.class);
+    conf.setClass("key.class", Text.class, Object.class);
+    conf.setClass("value.class", LinkedMapWritable.class, Object.class);
+    return conf;
+  }
+
+  private static Map<String, String> createElasticRow(String id, String name) {
+    Map<String, String> data = new HashMap<String, String>();
+    data.put("id", id);
+    data.put("scientist", name);
+    return data;
+  }
+
+  @AfterClass
+  public static void shutdownServer() throws IOException {
+    ElasticEmbeddedServer.shutdown();
+  }
+
+  /**
+   * Class for in memory Elasticsearch server.
+   */
+  static class ElasticEmbeddedServer implements Serializable {
+    private static final long serialVersionUID = 1L;
+    private static Node node;
+
+    public static void startElasticEmbeddedServer()
+        throws NodeValidationException, InterruptedException {
+      Settings settings = Settings.builder()
+          .put("node.data", TRUE)
+          .put("network.host", ELASTIC_IN_MEM_HOSTNAME)
+          .put("http.port", ELASTIC_IN_MEM_PORT)
+          .put("path.data", elasticTempFolder.getRoot().getPath())
+          .put("path.home", elasticTempFolder.getRoot().getPath())
+          .put("transport.type", "local")
+          .put("http.enabled", TRUE)
+          .put("node.ingest", TRUE).build();
+      node = new PluginNode(settings);
+      node.start();
+      LOGGER.info("Elastic in memory server started.");
+      prepareElasticIndex();
+      LOGGER.info("Prepared index " + ELASTIC_INDEX_NAME
+          + " and populated data on elastic in memory server.");
+    }
+
+    /**
+     * Prepares Elastic index, by adding rows.
+     */
+    private static void prepareElasticIndex() throws InterruptedException {
+      CreateIndexRequest indexRequest = new CreateIndexRequest(ELASTIC_INDEX_NAME);
+      node.client().admin().indices().create(indexRequest).actionGet();
+      for (int i = 0; i < TEST_DATA_ROW_COUNT; i++) {
+        node.client().prepareIndex(ELASTIC_INDEX_NAME, ELASTIC_TYPE_NAME, String.valueOf(i))
+            .setSource(createElasticRow(ELASTIC_TYPE_ID_PREFIX + i, "Faraday" + i)).execute()
+            .actionGet();
+      }
+      node.client().admin().indices().prepareRefresh(ELASTIC_INDEX_NAME).get();
+    }
+
+    /**
+     * Shuts down the embedded instance.
+     * @throws IOException
+     */
+    public static void shutdown() throws IOException {
+      DeleteIndexRequest indexRequest = new DeleteIndexRequest(ELASTIC_INDEX_NAME);
+      node.client().admin().indices().delete(indexRequest).actionGet();
+      LOGGER.info("Deleted index " + ELASTIC_INDEX_NAME + " from elastic in memory server");
+      node.close();
+      LOGGER.info("Closed elastic in memory server node.");
+      deleteElasticDataDirectory();
+    }
+
+    private static void deleteElasticDataDirectory() {
+      try {
+        FileUtils.deleteDirectory(new File(elasticTempFolder.getRoot().getPath()));
+      } catch (IOException e) {
+        throw new RuntimeException("Could not delete elastic data directory: " + e.getMessage(), e);
+      }
+    }
+  }
+
+  /**
+   * Node subclass that registers the Netty4 transport plugin so that the "http.enabled"
+   * property can be set to "true" for the Elasticsearch node.
+   */
+  static class PluginNode extends Node implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+    static Collection<Class<? extends Plugin>> list = new ArrayList<Class<? extends Plugin>>();
+    static {
+      list.add(Netty4Plugin.class);
+    }
+
+    public PluginNode(final Settings settings) {
+      super(InternalSettingsPreparer.prepareEnvironment(settings, null), list);
+    }
+  }
+}
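The class javadoc above notes that MapWritable can be used as the value class when the "mapred.mapoutput.value.class" property is also set; a hedged sketch of that variant, reusing getConfiguration() from this test and taking the property name from the javadoc (MapWritable here is org.apache.hadoop.io.MapWritable, which this test does not import):

```java
// Sketch, based on the note in the class javadoc; not exercised by this test.
Configuration conf = getConfiguration();
conf.setClass("mapred.mapoutput.value.class", MapWritable.class, Object.class);
conf.setClass("value.class", MapWritable.class, Object.class);
PCollection<KV<Text, MapWritable>> esData = pipeline.apply(
    HadoopInputFormatIO.<Text, MapWritable>read().withConfiguration(conf));
```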

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/custom/options/HIFTestOptions.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/custom/options/HIFTestOptions.java b/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/custom/options/HIFTestOptions.java
new file mode 100644
index 0000000..2e89ed1
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/custom/options/HIFTestOptions.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.hadoop.inputformat.custom.options;
+
+import org.apache.beam.sdk.options.Default;
+import org.apache.beam.sdk.options.Description;
+import org.apache.beam.sdk.testing.TestPipelineOptions;
+
+/**
+ * Properties needed when using HadoopInputFormatIO with the Beam SDK.
+ */
+public interface HIFTestOptions extends TestPipelineOptions {
+
+  //Cassandra test options
+  @Description("Cassandra Server IP")
+  @Default.String("cassandraServerIp")
+  String getCassandraServerIp();
+  void setCassandraServerIp(String cassandraServerIp);
+  @Description("Cassandra Server port")
+  @Default.Integer(0)
+  Integer getCassandraServerPort();
+  void setCassandraServerPort(Integer cassandraServerPort);
+  @Description("Cassandra User name")
+  @Default.String("cassandraUserName")
+  String getCassandraUserName();
+  void setCassandraUserName(String cassandraUserName);
+  @Description("Cassandra Password")
+  @Default.String("cassandraPassword")
+  String getCassandraPassword();
+  void setCassandraPassword(String cassandraPassword);
+
+  //Elasticsearch test options
+  @Description("Elasticsearch Server IP")
+  @Default.String("elasticServerIp")
+  String getElasticServerIp();
+  void setElasticServerIp(String elasticServerIp);
+  @Description("Elasticsearch Server port")
+  @Default.Integer(0)
+  Integer getElasticServerPort();
+  void setElasticServerPort(Integer elasticServerPort);
+  @Description("Elasticsearch User name")
+  @Default.String("elasticUserName")
+  String getElasticUserName();
+  void setElasticUserName(String elasticUserName);
+  @Description("Elasticsearch Password")
+  @Default.String("elasticPassword")
+  String getElasticPassword();
+  void setElasticPassword(String elasticPassword);
+}
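A brief sketch of how these options are typically obtained inside an integration test; TestPipeline.testingPipelineOptions() is the standard Beam hook, and the actual values come from the pipeline options supplied when the test is run:

```java
// Sketch: reading HIFTestOptions in an integration test.
HIFTestOptions options = TestPipeline.testingPipelineOptions().as(HIFTestOptions.class);
String cassandraIp = options.getCassandraServerIp();
Integer cassandraPort = options.getCassandraServerPort();
```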

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/hashing/HashingFn.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/hashing/HashingFn.java b/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/hashing/HashingFn.java
new file mode 100644
index 0000000..fe37048
--- /dev/null
+++ b/sdks/java/io/hadoop/jdk1.8-tests/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/hashing/HashingFn.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.beam.sdk.io.hadoop.inputformat.hashing;
+
+import com.google.common.collect.Lists;
+import com.google.common.hash.HashCode;
+import com.google.common.hash.Hashing;
+
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.apache.beam.sdk.coders.CannotProvideCoderException;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.CoderRegistry;
+import org.apache.beam.sdk.coders.SerializableCoder;
+import org.apache.beam.sdk.transforms.Combine.CombineFn;
+
+/**
+ * Custom CombineFn for hashing. Element hashes are merged with Hashing.combineUnordered, and the
+ * accumulator holds a single HashCode.
+ */
+public class HashingFn extends CombineFn<String, HashingFn.Accum, String> {
+
+  /**
+   * Serializable Class to store the HashCode of input String.
+   */
+  public static class Accum implements Serializable {
+    HashCode hashCode = null;
+
+    public Accum(HashCode value) {
+      this.hashCode = value;
+    }
+
+    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
+      in.defaultReadObject();
+    }
+
+    private void writeObject(ObjectOutputStream out) throws IOException {
+      out.defaultWriteObject();
+    }
+  }
+
+  @Override
+  public Accum addInput(Accum accum, String input) {
+    List<HashCode> elementHashes = Lists.newArrayList();
+    if (accum.hashCode != null) {
+      elementHashes.add(accum.hashCode);
+    }
+    HashCode inputHashCode = Hashing.sha1().hashString(input, StandardCharsets.UTF_8);
+    elementHashes.add(inputHashCode);
+    accum.hashCode = Hashing.combineUnordered(elementHashes);
+    return accum;
+  }
+
+  @Override
+  public Accum mergeAccumulators(Iterable<Accum> accums) {
+    Accum merged = createAccumulator();
+    List<HashCode> elementHashes = Lists.newArrayList();
+    for (Accum accum : accums) {
+      if (accum.hashCode != null) {
+        elementHashes.add(accum.hashCode);
+      }
+    }
+    // Hashing.combineUnordered requires a non-empty input; leave the accumulator
+    // empty if no element hashes were collected.
+    if (!elementHashes.isEmpty()) {
+      merged.hashCode = Hashing.combineUnordered(elementHashes);
+    }
+    return merged;
+  }
+
+  @Override
+  public String extractOutput(Accum accum) {
+    // Return the combined hash code of the elements in the PCollection.
+    String consolidatedHash = "";
+    if (accum.hashCode != null) {
+      consolidatedHash = accum.hashCode.toString();
+    }
+    return consolidatedHash;
+  }
+
+  @Override
+  public Coder<Accum> getAccumulatorCoder(CoderRegistry registry, Coder<String> inputCoder)
+      throws CannotProvideCoderException {
+    return SerializableCoder.of(Accum.class);
+  }
+
+  @Override
+  public Coder<String> getDefaultOutputCoder(CoderRegistry registry, Coder<String> inputCoder) {
+    return inputCoder;
+  }
+
+  @Override
+  public Accum createAccumulator() {
+    return new Accum(null);
+  }
+}
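HashingFn is applied in HIFIOWithElasticTest above through Combine.globally; repeated here as a small usage sketch for an order-insensitive checksum over a PCollection<String> (textValues and expectedHashCode are assumed from the surrounding test):

```java
// Sketch: order-insensitive checksum of a PCollection<String>, as in the tests above.
PCollection<String> consolidatedHashcode =
    textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
```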


[6/7] beam git commit: HadoopInputFormatIO with junits

Posted by da...@apache.org.
HadoopInputFormatIO with junits


Project: http://git-wip-us.apache.org/repos/asf/beam/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/174436bc
Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/174436bc
Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/174436bc

Branch: refs/heads/master
Commit: 174436bcff6294499aafb4b4d22bbe0b19270c19
Parents: 9c284d6
Author: Radhika S Kulkarni <ra...@persistent.co.in>
Authored: Mon Mar 6 19:41:45 2017 +0530
Committer: Davor Bonaci <da...@google.com>
Committed: Thu Apr 6 16:30:12 2017 +0200

----------------------------------------------------------------------
 pom.xml                                         |    6 +
 sdks/java/io/hadoop-input-format/README.md      |  167 ---
 sdks/java/io/hadoop-input-format/pom.xml        |  136 ---
 .../hadoop/inputformat/HadoopInputFormatIO.java |  941 ---------------
 .../sdk/io/hadoop/inputformat/package-info.java |   23 -
 .../ConfigurableEmployeeInputFormat.java        |  131 ---
 .../sdk/io/hadoop/inputformat/Employee.java     |   85 --
 .../hadoop/inputformat/EmployeeInputFormat.java |  172 ---
 .../inputformat/HadoopInputFormatIOTest.java    |  844 --------------
 .../ReuseObjectsEmployeeInputFormat.java        |  176 ---
 .../hadoop/inputformat/TestEmployeeDataSet.java |   76 --
 sdks/java/io/hadoop/README.md                   |  167 +++
 sdks/java/io/hadoop/input-format/pom.xml        |   98 ++
 .../hadoop/inputformat/HadoopInputFormatIO.java |  842 ++++++++++++++
 .../sdk/io/hadoop/inputformat/package-info.java |   23 +
 .../ConfigurableEmployeeInputFormat.java        |  131 +++
 .../sdk/io/hadoop/inputformat/Employee.java     |   85 ++
 .../hadoop/inputformat/EmployeeInputFormat.java |  172 +++
 .../inputformat/HadoopInputFormatIOTest.java    |  797 +++++++++++++
 .../ReuseObjectsEmployeeInputFormat.java        |  176 +++
 .../hadoop/inputformat/TestEmployeeDataSet.java |   76 ++
 sdks/java/io/hadoop/jdk1.8-tests/pom.xml        |  278 +++++
 .../inputformat/HIFIOWithElasticTest.java       |  277 +++++
 .../custom/options/HIFTestOptions.java          |   64 ++
 .../hadoop/inputformat/hashing/HashingFn.java   |  109 ++
 .../integration/tests/HIFIOCassandraIT.java     |  173 +++
 .../integration/tests/HIFIOElasticIT.java       |  215 ++++
 .../src/test/resources/cassandra.yaml           | 1074 ++++++++++++++++++
 .../SmallITCluster/cassandra-svc-rc.yaml        |   88 ++
 .../cassandra/SmallITCluster/start-up.sh        |   21 +
 .../cassandra/SmallITCluster/teardown.sh        |   21 +
 .../kubernetes/cassandra/data-load-setup.sh     |   29 +
 .../resources/kubernetes/cassandra/data-load.sh |   67 ++
 .../LargeProductionCluster/es-services.yaml     |  277 +++++
 .../LargeProductionCluster/start-up.sh          |   21 +
 .../LargeProductionCluster/teardown.sh          |   20 +
 .../SmallITCluster/elasticsearch-svc-rc.yaml    |   84 ++
 .../elasticsearch/SmallITCluster/start-up.sh    |   22 +
 .../elasticsearch/SmallITCluster/teardown.sh    |   20 +
 .../kubernetes/elasticsearch/data-load-setup.sh |   26 +
 .../kubernetes/elasticsearch/data-load.sh       |   33 +
 .../kubernetes/elasticsearch/es_test_data.py    |  299 +++++
 .../kubernetes/elasticsearch/show-health.sh     |   25 +
 sdks/java/io/hadoop/pom.xml                     |   53 +
 sdks/java/io/pom.xml                            |    2 +-
 45 files changed, 5870 insertions(+), 2752 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 3f53b1f..09f3985 100644
--- a/pom.xml
+++ b/pom.xml
@@ -446,6 +446,12 @@
         <version>${project.version}</version>
       </dependency>
 
+      <dependency>
+        <groupId>org.apache.beam</groupId>
+        <artifactId>beam-sdks-java-io-hadoop-input-format</artifactId>
+        <version>${project.version}</version>
+      </dependency>
+
       <dependency>
         <groupId>org.apache.beam</groupId>
         <artifactId>beam-runners-core-construction-java</artifactId>

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop-input-format/README.md
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop-input-format/README.md b/sdks/java/io/hadoop-input-format/README.md
deleted file mode 100644
index d91f019..0000000
--- a/sdks/java/io/hadoop-input-format/README.md
+++ /dev/null
@@ -1,167 +0,0 @@
-<!--
-    Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
--->
-
-# Hadoop InputFormat IO
-
-A HadoopInputFormatIO is a Transform for reading data from any source which
-implements Hadoop InputFormat. For example- Cassandra, Elasticsearch, HBase, Redis, Postgres, etc.
-
-HadoopInputFormatIO has to make several performance trade-offs in connecting to InputFormat, so if there is another Beam IO Transform specifically for connecting to your data source of choice, we would recommend using that one, but this IO Transform allows you to connect to many data sources that do not yet have a Beam IO Transform.
-
-You will need to pass a Hadoop Configuration with parameters specifying how the read will occur. Many properties of the Configuration are optional, and some are required for certain InputFormat classes, but the following properties must be set for all InputFormats:
-
-mapreduce.job.inputformat.class: The InputFormat class used to connect to your data source of choice.
-key.class: The key class returned by the InputFormat in 'mapreduce.job.inputformat.class'.
-value.class: The value class returned by the InputFormat in 'mapreduce.job.inputformat.class'.
-
-For example:
-```java
-Configuration myHadoopConfiguration = new Configuration(false);
-// Set Hadoop InputFormat, key and value class in configuration
-myHadoopConfiguration.setClass("mapreduce.job.inputformat.class", InputFormatClass,
-  InputFormat.class);
-myHadoopConfiguration.setClass("key.class", InputFormatKeyClass, Object.class);
-myHadoopConfiguration.setClass("value.class", InputFormatValueClass, Object.class);
-```
-
-You will need to check to see if the key and value classes output by the InputFormat have a Beam Coder available. If not, You can use withKeyTranslation/withValueTranslation to specify a method transforming instances of those classes into another class that is supported by a Beam Coder. These settings are optional and you don't need to specify translation for both key and value.
-
-For example:
-```java
-SimpleFunction<InputFormatKeyClass, MyKeyClass> myOutputKeyType =
-new SimpleFunction<InputFormatKeyClass, MyKeyClass>() {
-  public MyKeyClass apply(InputFormatKeyClass input) {
-  // ...logic to transform InputFormatKeyClass to MyKeyClass
-  }
-};
-SimpleFunction<InputFormatValueClass, MyValueClass> myOutputValueType =
-new SimpleFunction<InputFormatValueClass, MyValueClass>() {
-  public MyValueClass apply(InputFormatValueClass input) {
-  // ...logic to transform InputFormatValueClass to MyValueClass
-  }
-};
-```
-
-### Reading using Hadoop InputFormat IO
-Pipeline p = ...; // Create pipeline.
-// Read data only with Hadoop configuration.
-
-```java
-p.apply("read",
-  HadoopInputFormatIO.<InputFormatKeyClass, InputFormatKeyClass>read()
-  .withConfiguration(myHadoopConfiguration);
-```
-
-// Read data with configuration and key translation (Example scenario: Beam Coder is not
-available for key class hence key translation is required.).
-
-```java
-p.apply("read",
-  HadoopInputFormatIO.<MyKeyClass, InputFormatKeyClass>read()
-  .withConfiguration(myHadoopConfiguration)
-  .withKeyTranslation(myOutputKeyType);
-```
-
-// Read data with configuration and value translation (Example scenario: Beam Coder is not
-available for value class hence value translation is required.).
-
-```java
-p.apply("read",
-  HadoopInputFormatIO.<InputFormatKeyClass, MyValueClass>read()
-  .withConfiguration(myHadoopConfiguration)
-  .withValueTranslation(myOutputValueType);
-```
-
-// Read data with configuration, value translation and key translation (Example scenario: Beam Coders are not available for both key class and value class of InputFormat hence key and value translation is required.).
-
-```java
-p.apply("read",
-  HadoopInputFormatIO.<MyKeyClass, MyValueClass>read()
-  .withConfiguration(myHadoopConfiguration)
-  .withKeyTranslation(myOutputKeyType)
-  .withValueTranslation(myOutputValueType);
-```
-
-# Examples for specific InputFormats
-
-### Cassandra - CqlInputFormat
-
-To read data from Cassandra, org.apache.cassandra.hadoop.cql3.CqlInputFormat
-CqlInputFormat can be used which needs following properties to be set.
-
-Create Cassandra Hadoop configuration as follows:
-
-```java
-Configuration cassandraConf = new Configuration();
-cassandraConf.set("cassandra.input.thrift.port", "9160");
-cassandraConf.set("cassandra.input.thrift.address", CassandraHostIp);
-cassandraConf.set("cassandra.input.partitioner.class", "Murmur3Partitioner");
-cassandraConf.set("cassandra.input.keyspace", "myKeySpace");
-cassandraConf.set("cassandra.input.columnfamily", "myColumnFamily");
-cassandraConf.setClass("key.class", java.lang.Long Long.class, Object.class);
-cassandraConf.setClass("value.class", com.datastax.driver.core.Row Row.class, Object.class);
-cassandraConf.setClass("mapreduce.job.inputformat.class", org.apache.cassandra.hadoop.cql3.CqlInputFormat CqlInputFormat.class, InputFormat.class);
-```
-
-Call Read transform as follows:
-
-```java
-PCollection<KV<Long, String>> cassandraData =
-  p.apply("read",
-  HadoopInputFormatIO.<Long, String>read()
-  .withConfiguration(cassandraConf)
-  .withValueTranslation(cassandraOutputValueType);
-```
-
-The CqlInputFormat key class is java.lang.Long Long, which has a Beam Coder. The CqlInputFormat value class is com.datastax.driver.core.Row Row, which does not have a Beam Coder. Rather than write a new coder, you can provide your own translation method as follows:
-
-```java
-SimpleFunction<Row, String> cassandraOutputValueType = SimpleFunction<Row, String>()
-{
-  public String apply(Row row) {
-    return row.getString('myColName');
-  }
-};
-```
- 
-### Elasticsearch - EsInputFormat
- 
-To read data from Elasticsearch, EsInputFormat can be used which needs following properties to be set.
- 
-Create ElasticSearch Hadoop configuration as follows:
-
-```java
-Configuration elasticSearchConf = new Configuration();
-elasticSearchConf.set("es.nodes", ElasticsearchHostIp);
-elasticSearchConf.set("es.port", "9200");
-elasticSearchConf.set("es.resource", "ElasticIndexName/ElasticTypeName");
-elasticSearchConf.setClass("key.class", org.apache.hadoop.io.Text Text.class, Object.class);
-elasticSearchConf.setClass("value.class", org.elasticsearch.hadoop.mr.LinkedMapWritable LinkedMapWritable.class, Object.class);
-elasticSearchConf.setClass("mapreduce.job.inputformat.class", org.elasticsearch.hadoop.mr.EsInputFormat EsInputFormat.class, InputFormat.class);
-```
-
-Call Read transform as follows:
-
-```java
-PCollection<KV<Text, LinkedMapWritable>> elasticData = p.apply("read",
-  HadoopInputFormatIO.<Text, LinkedMapWritable>read().withConfiguration(elasticSearchConf));
-```
-
-The org.elasticsearch.hadoop.mr.EsInputFormat EsInputFormat key class is
-org.apache.hadoop.io.Text Text and value class is org.elasticsearch.hadoop.mr.LinkedMapWritable LinkedMapWritable. Both key and value classes have Beam Coders.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop-input-format/pom.xml
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop-input-format/pom.xml b/sdks/java/io/hadoop-input-format/pom.xml
deleted file mode 100644
index 6680087..0000000
--- a/sdks/java/io/hadoop-input-format/pom.xml
+++ /dev/null
@@ -1,136 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-    Licensed to the Apache Software Foundation (ASF) under one or more
-    contributor license agreements.  See the NOTICE file distributed with
-    this work for additional information regarding copyright ownership.
-    The ASF licenses this file to You under the Apache License, Version 2.0
-    (the "License"); you may not use this file except in compliance with
-    the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software
-    distributed under the License is distributed on an "AS IS" BASIS,
-    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    See the License for the specific language governing permissions and
-    limitations under the License.
--->
-<project
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
-  xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.beam</groupId>
-    <artifactId>beam-sdks-java-io-parent</artifactId>
-    <version>0.7.0-SNAPSHOT</version>
-    <relativePath>../pom.xml</relativePath>
-  </parent>
-
-  <artifactId>beam-sdks-java-io-hadoop-input-format</artifactId>
-  <name>Apache Beam :: SDKs :: Java :: IO :: Hadoop Input Format</name>
-  <description>IO library to read data from data sources which implement Hadoop Input Format from Beam.</description>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-compiler-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-surefire-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-jar-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
-  <properties>
-    <log4j.core.version>2.6.2</log4j.core.version>
-    <hadoop.common.version>2.7.0</hadoop.common.version>
-    <findbugs.jsr305.version>3.0.1</findbugs.jsr305.version>
-    <slf4j.api.version>1.7.14</slf4j.api.version>
-    <guava.version>19.0</guava.version>
-  </properties>
-
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.beam</groupId>
-      <artifactId>beam-sdks-java-core</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>com.google.guava</groupId>
-      <artifactId>guava</artifactId>
-      <version>${guava.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-api</artifactId>
-      <version>${slf4j.api.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>com.google.code.findbugs</groupId>
-      <artifactId>jsr305</artifactId>
-      <version>${findbugs.jsr305.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.beam</groupId>
-      <artifactId>beam-sdks-java-io-hadoop-common</artifactId>
-    </dependency>
-
-    <!-- compile dependencies -->
-    <dependency>
-      <groupId>com.google.auto.value</groupId>
-      <artifactId>auto-value</artifactId>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-common</artifactId>
-      <version>${hadoop.common.version}</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-mapreduce-client-core</artifactId>
-      <version>${hadoop.common.version}</version>
-      <scope>provided</scope>
-    </dependency>
-
-    <!-- test dependencies -->
-    <dependency>
-      <groupId>org.apache.beam</groupId>
-      <artifactId>beam-sdks-java-core</artifactId>
-      <classifier>tests</classifier>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.beam</groupId>
-      <artifactId>beam-runners-direct-java</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.logging.log4j</groupId>
-      <artifactId>log4j-core</artifactId>
-      <version>${log4j.core.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.hamcrest</groupId>
-      <artifactId>hamcrest-all</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>junit</groupId>
-      <artifactId>junit</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.mockito</groupId>
-      <artifactId>mockito-all</artifactId>
-      <scope>test</scope>
-    </dependency>
-  </dependencies>
-</project>

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop-input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIO.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop-input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIO.java b/sdks/java/io/hadoop-input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIO.java
deleted file mode 100644
index 3b786fb..0000000
--- a/sdks/java/io/hadoop-input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/HadoopInputFormatIO.java
+++ /dev/null
@@ -1,941 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
- * agreements. See the NOTICE file distributed with this work for additional information regarding
- * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance with the License. You may obtain a
- * copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-package org.apache.beam.sdk.io.hadoop.inputformat;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import com.google.auto.value.AutoValue;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Function;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-import com.google.common.util.concurrent.AtomicDouble;
-
-import java.io.Externalizable;
-import java.io.IOException;
-import java.io.ObjectInput;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutput;
-import java.io.ObjectOutputStream;
-import java.io.Serializable;
-import java.lang.reflect.ParameterizedType;
-import java.lang.reflect.Type;
-import java.math.BigDecimal;
-import java.math.BigInteger;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map.Entry;
-import java.util.Set;
-
-import javax.annotation.Nullable;
-
-import org.apache.beam.sdk.coders.CannotProvideCoderException;
-import org.apache.beam.sdk.coders.Coder;
-import org.apache.beam.sdk.coders.CoderException;
-import org.apache.beam.sdk.coders.CoderRegistry;
-import org.apache.beam.sdk.coders.KvCoder;
-import org.apache.beam.sdk.io.BoundedSource;
-import org.apache.beam.sdk.io.hadoop.WritableCoder;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.SimpleFunction;
-import org.apache.beam.sdk.transforms.display.DisplayData;
-import org.apache.beam.sdk.util.CoderUtils;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PBegin;
-import org.apache.beam.sdk.values.PCollection;
-import org.apache.beam.sdk.values.TypeDescriptor;
-import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.ObjectWritable;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.TaskAttemptID;
-import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * A {@link HadoopInputFormatIO} is a Transform for reading data from any source which
- * implements Hadoop {@link InputFormat}, for example Cassandra, Elasticsearch, HBase, Redis,
- * Postgres, etc. {@link HadoopInputFormatIO} has to make several performance trade-offs in
- * connecting to {@link InputFormat}, so if there is another Beam IO Transform specifically for
- * connecting to your data source of choice, we would recommend using that one, but this IO
- * Transform allows you to connect to many data sources that do not yet have a Beam IO Transform.
- *
- * <p>You will need to pass a Hadoop {@link Configuration} with parameters specifying how the read
- * will occur. Many properties of the Configuration are optional, and some are required for certain
- * {@link InputFormat} classes, but the following properties must be set for all InputFormats:
- * <ul>
- * <li>{@code mapreduce.job.inputformat.class}: The {@link InputFormat} class used to connect to
- * your data source of choice.</li>
- * <li>{@code key.class}: The key class returned by the {@link InputFormat} in
- * {@code mapreduce.job.inputformat.class}.</li>
- * <li>{@code value.class}: The value class returned by the {@link InputFormat} in
- * {@code mapreduce.job.inputformat.class}.</li>
- * </ul>
- * For example:
- *
- * <pre>
- * {
- *   Configuration myHadoopConfiguration = new Configuration(false);
- *   // Set Hadoop InputFormat, key and value class in configuration
- *   myHadoopConfiguration.setClass(&quot;mapreduce.job.inputformat.class&quot;,
- *      MyDbInputFormatClass, InputFormat.class);
- *   myHadoopConfiguration.setClass(&quot;key.class&quot;, MyDbInputFormatKeyClass, Object.class);
- *   myHadoopConfiguration.setClass(&quot;value.class&quot;,
- *      MyDbInputFormatValueClass, Object.class);
- * }
- * </pre>
- *
- * <p>You will need to check to see if the key and value classes output by the {@link InputFormat}
- * have a Beam {@link Coder} available. If not, you can use withKeyTranslation/withValueTranslation
- * to specify a method transforming instances of those classes into another class that is supported
- * by a Beam {@link Coder}. These settings are optional and you don't need to specify translation
- * for both key and value. If you specify a translation, you will need to make sure that the K or V
- * type of the read transform matches the output type of the translation.
- *
- * <h3>Reading using {@link HadoopInputFormatIO}</h3>
- *
- * <pre>
- * {@code
- * Pipeline p = ...; // Create pipeline.
- * // Read data only with Hadoop configuration.
- * p.apply("read",
- *     HadoopInputFormatIO.<InputFormatKeyClass, InputFormatValueClass>read()
- *              .withConfiguration(myHadoopConfiguration));
- * }
- * // Read data with configuration and key translation (example scenario: a Beam Coder is not
- * available for the key class, hence key translation is required).
- * SimpleFunction&lt;InputFormatKeyClass, MyKeyClass&gt; myOutputKeyType =
- *       new SimpleFunction&lt;InputFormatKeyClass, MyKeyClass&gt;() {
- *         public MyKeyClass apply(InputFormatKeyClass input) {
- *           // ...logic to transform InputFormatKeyClass to MyKeyClass
- *         }
- * };
- * </pre>
- *
- * <pre>
- * {@code
- * p.apply("read",
- *     HadoopInputFormatIO.<MyKeyClass, InputFormatValueClass>read()
- *              .withConfiguration(myHadoopConfiguration)
- *              .withKeyTranslation(myOutputKeyType));
- * }
- * </pre>
- *
- * <p>// Read data with configuration and value translation (example scenario: a Beam Coder is not
- * available for the value class, hence value translation is required).
- *
- * <pre>
- * {@code
- * SimpleFunction&lt;InputFormatValueClass, MyValueClass&gt; myOutputValueType =
- *      new SimpleFunction&lt;InputFormatValueClass, MyValueClass&gt;() {
- *          public MyValueClass apply(InputFormatValueClass input) {
- *            // ...logic to transform InputFormatValueClass to MyValueClass
- *          }
- *  };
- * }
- * </pre>
- *
- * <pre>
- * {@code
- * p.apply("read",
- *     HadoopInputFormatIO.<InputFormatKeyClass, MyValueClass>read()
- *              .withConfiguration(myHadoopConfiguration)
- *              .withValueTranslation(myOutputValueType));
- * }
- * </pre>
- */
-
-public class HadoopInputFormatIO {
-  private static final Logger LOG = LoggerFactory.getLogger(HadoopInputFormatIO.class);
-
-  /**
-   * Creates an uninitialized {@link HadoopInputFormatIO.Read}. Before use, the {@code Read} must
-   * be initialized with a {@link HadoopInputFormatIO.Read#withConfiguration(Configuration)} that
-   * specifies the source. A key/value translation may also optionally be specified using
-   * {@link HadoopInputFormatIO.Read#withKeyTranslation}/
-   * {@link HadoopInputFormatIO.Read#withValueTranslation}.
-   */
-  public static <K, V> Read<K, V> read() {
-    return new AutoValue_HadoopInputFormatIO_Read.Builder<K, V>().build();
-  }
-
-  /**
-   * A {@link PTransform} that reads from any data source which implements Hadoop InputFormat,
-   * e.g. Cassandra, Elasticsearch, HBase, Redis, Postgres, etc. See the class-level Javadoc on
-   * {@link HadoopInputFormatIO} for more information.
-   * @param <K> Type of keys to be read.
-   * @param <V> Type of values to be read.
-   * @see HadoopInputFormatIO
-   */
-  @AutoValue
-  public abstract static class Read<K, V> extends PTransform<PBegin, PCollection<KV<K, V>>> {
-
-    // Returns the Hadoop Configuration which contains specification of source.
-    @Nullable
-    public abstract SerializableConfiguration getConfiguration();
-
-    @Nullable public abstract SimpleFunction<?, K> getKeyTranslationFunction();
-    @Nullable public abstract SimpleFunction<?, V> getValueTranslationFunction();
-    @Nullable public abstract TypeDescriptor<K> getKeyTypeDescriptor();
-    @Nullable public abstract TypeDescriptor<V> getValueTypeDescriptor();
-    @Nullable public abstract TypeDescriptor<?> getinputFormatClass();
-    @Nullable public abstract TypeDescriptor<?> getinputFormatKeyClass();
-    @Nullable public abstract TypeDescriptor<?> getinputFormatValueClass();
-
-    abstract Builder<K, V> toBuilder();
-
-    @AutoValue.Builder
-    abstract static class Builder<K, V> {
-      abstract Builder<K, V> setConfiguration(SerializableConfiguration configuration);
-      abstract Builder<K, V> setKeyTranslationFunction(SimpleFunction<?, K> function);
-      abstract Builder<K, V> setValueTranslationFunction(SimpleFunction<?, V> function);
-      abstract Builder<K, V> setKeyTypeDescriptor(TypeDescriptor<K> keyTypeDescriptor);
-      abstract Builder<K, V> setValueTypeDescriptor(TypeDescriptor<V> valueTypeDescriptor);
-      abstract Builder<K, V> setInputFormatClass(TypeDescriptor<?> inputFormatClass);
-      abstract Builder<K, V> setInputFormatKeyClass(TypeDescriptor<?> inputFormatKeyClass);
-      abstract Builder<K, V> setInputFormatValueClass(TypeDescriptor<?> inputFormatValueClass);
-      abstract Read<K, V> build();
-    }
-
-    /**
-     * Returns a new {@link HadoopInputFormatIO.Read} that will read from the source using the
-     * options provided by the given configuration.
-     *
-     * <p>Does not modify this object.
-     */
-    public Read<K, V> withConfiguration(Configuration configuration) {
-      validateConfiguration(configuration);
-      TypeDescriptor<?> inputFormatClass =
-          TypeDescriptor.of(configuration.getClass("mapreduce.job.inputformat.class", null));
-      TypeDescriptor<?> inputFormatKeyClass =
-          TypeDescriptor.of(configuration.getClass("key.class", null));
-      TypeDescriptor<?> inputFormatValueClass =
-          TypeDescriptor.of(configuration.getClass("value.class", null));
-      Builder<K, V> builder =
-          toBuilder().setConfiguration(new SerializableConfiguration(configuration));
-      builder.setInputFormatClass(inputFormatClass);
-      builder.setInputFormatKeyClass(inputFormatKeyClass);
-      builder.setInputFormatValueClass(inputFormatValueClass);
-      /*
-       * Sets the output key class to InputFormat key class if withKeyTranslation() is not called
-       * yet.
-       */
-      if (getKeyTranslationFunction() == null) {
-        builder.setKeyTypeDescriptor((TypeDescriptor<K>) inputFormatKeyClass);
-      }
-      /*
-       * Sets the output value class to InputFormat value class if withValueTranslation() is not
-       * called yet.
-       */
-      if (getValueTranslationFunction() == null) {
-        builder.setValueTypeDescriptor((TypeDescriptor<V>) inputFormatValueClass);
-      }
-      return builder.build();
-    }
-
-    /**
-     * Returns a new {@link HadoopInputFormatIO.Read} that will transform the keys read from the
-     * source using the given key translation function.
-     *
-     * <p>Does not modify this object.
-     */
-    public Read<K, V> withKeyTranslation(SimpleFunction<?, K> function) {
-      checkNotNull(function, "function");
-      // Sets key class to key translation function's output class type.
-      return toBuilder().setKeyTranslationFunction(function)
-          .setKeyTypeDescriptor((TypeDescriptor<K>) function.getOutputTypeDescriptor()).build();
-    }
-
-    /**
-     * Returns a new {@link HadoopInputFormatIO.Read} that will transform the values read from the
-     * source using the given value translation function.
-     *
-     * <p>Does not modify this object.
-     */
-    public Read<K, V> withValueTranslation(SimpleFunction<?, V> function) {
-      checkNotNull(function, "function");
-      // Sets value class to value translation function's output class type.
-      return toBuilder().setValueTranslationFunction(function)
-          .setValueTypeDescriptor((TypeDescriptor<V>) function.getOutputTypeDescriptor()).build();
-    }
-
-    @Override
-    public PCollection<KV<K, V>> expand(PBegin input) {
-      // Get the key and value coders based on the key and value classes.
-      CoderRegistry coderRegistry = input.getPipeline().getCoderRegistry();
-      Coder<K> keyCoder = getDefaultCoder(getKeyTypeDescriptor(), coderRegistry);
-      Coder<V> valueCoder = getDefaultCoder(getValueTypeDescriptor(), coderRegistry);
-      HadoopInputFormatBoundedSource<K, V> source = new HadoopInputFormatBoundedSource<K, V>(
-          getConfiguration(),
-          keyCoder,
-          valueCoder,
-          getKeyTranslationFunction(),
-          getValueTranslationFunction());
-      return input.getPipeline().apply(org.apache.beam.sdk.io.Read.from(source));
-    }
-
-    /**
-     * Validates that the mandatory configuration properties such as InputFormat class, InputFormat
-     * key and value classes are provided in the Hadoop configuration.
-     */
-    private void validateConfiguration(Configuration configuration) {
-      checkNotNull(configuration, "configuration");
-      checkNotNull(configuration.get("mapreduce.job.inputformat.class"),
-          "configuration.get(\"mapreduce.job.inputformat.class\")");
-      checkNotNull(configuration.get("key.class"), "configuration.get(\"key.class\")");
-      checkNotNull(configuration.get("value.class"),
-          "configuration.get(\"value.class\")");
-    }
-
-    /**
-     * Validates inputs provided by the pipeline user before reading the data.
-     */
-    @Override
-    public void validate(PBegin input) {
-      checkNotNull(getConfiguration(), "getConfiguration()");
-      // Validate that the key translation input type must be same as key class of InputFormat.
-      validateTranslationFunction(getinputFormatKeyClass(), getKeyTranslationFunction(),
-          "Key translation's input type is not same as hadoop InputFormat : %s key class : %s");
-      // Validate that the value translation input type must be same as value class of InputFormat.
-      validateTranslationFunction(getinputFormatValueClass(), getValueTranslationFunction(),
-          "Value translation's input type is not same as hadoop InputFormat :  "
-              + "%s value class : %s");
-    }
-
-    /**
-     * Validates translation function given for key/value translation.
-     */
-    private void validateTranslationFunction(TypeDescriptor<?> inputType,
-        SimpleFunction<?, ?> simpleFunction, String errorMsg) {
-      if (simpleFunction != null) {
-        if (!simpleFunction.getInputTypeDescriptor().equals(inputType)) {
-          throw new IllegalArgumentException(
-              String.format(errorMsg, getinputFormatClass().getRawType(), inputType.getRawType()));
-        }
-      }
-    }
-
-    /**
-     * Returns the default coder for a given type descriptor. The Coder Registry is queried for the
-     * correct coder; if none is found and the type descriptor is a {@link Writable}, a
-     * {@link WritableCoder} is returned, otherwise an exception ("Cannot find coder") is thrown.
-     */
-    @VisibleForTesting
-    public <T> Coder<T> getDefaultCoder(TypeDescriptor<?> typeDesc, CoderRegistry coderRegistry) {
-      Class classType = typeDesc.getRawType();
-      try {
-        return (Coder<T>) coderRegistry.getCoder(typeDesc);
-      } catch (CannotProvideCoderException e) {
-        if (Writable.class.isAssignableFrom(classType)) {
-          return (Coder<T>) WritableCoder.of(classType);
-        }
-        throw new IllegalStateException(String.format("Cannot find coder for %s  : ", typeDesc)
-            + e.getMessage(), e);
-      }
-    }
-
-    @Override
-    public void populateDisplayData(DisplayData.Builder builder) {
-      super.populateDisplayData(builder);
-      if (getConfiguration().getHadoopConfiguration() != null) {
-        Iterator<Entry<String, String>> configProperties = getConfiguration()
-            .getHadoopConfiguration().iterator();
-        while (configProperties.hasNext()) {
-          Entry<String, String> property = configProperties.next();
-          builder.addIfNotNull(DisplayData.item(property.getKey(), property.getValue())
-              .withLabel(property.getKey()));
-        }
-      }
-    }
-  }
-
-  /**
-   * Bounded source implementation for {@link HadoopInputFormatIO}.
-   * @param <K> Type of keys to be read.
-   * @param <V> Type of values to be read.
-   */
-  public static class HadoopInputFormatBoundedSource<K, V> extends BoundedSource<KV<K, V>>
-      implements Serializable {
-    private final SerializableConfiguration conf;
-    private final Coder<K> keyCoder;
-    private final Coder<V> valueCoder;
-    @Nullable private final SimpleFunction<?, K> keyTranslationFunction;
-    @Nullable private final SimpleFunction<?, V> valueTranslationFunction;
-    private final SerializableSplit inputSplit;
-    private transient List<SerializableSplit> inputSplits;
-    private long boundedSourceEstimatedSize = 0;
-    private transient InputFormat<?, ?> inputFormatObj;
-    private transient TaskAttemptContext taskAttemptContext;
-    HadoopInputFormatBoundedSource(
-        SerializableConfiguration conf,
-        Coder<K> keyCoder,
-        Coder<V> valueCoder,
-        @Nullable SimpleFunction<?, K> keyTranslationFunction,
-        @Nullable SimpleFunction<?, V> valueTranslationFunction) {
-      this(conf,
-          keyCoder,
-          valueCoder,
-          keyTranslationFunction,
-          valueTranslationFunction,
-          null);
-    }
-
-    protected HadoopInputFormatBoundedSource(
-        SerializableConfiguration conf,
-        Coder<K> keyCoder,
-        Coder<V> valueCoder,
-        @Nullable SimpleFunction<?, K> keyTranslationFunction,
-        @Nullable SimpleFunction<?, V> valueTranslationFunction,
-        SerializableSplit inputSplit) {
-      this.conf = conf;
-      this.inputSplit = inputSplit;
-      this.keyCoder = keyCoder;
-      this.valueCoder = valueCoder;
-      this.keyTranslationFunction = keyTranslationFunction;
-      this.valueTranslationFunction = valueTranslationFunction;
-    }
-
-    public SerializableConfiguration getConfiguration() {
-      return conf;
-    }
-
-    @Override
-    public void validate() {
-      checkNotNull(conf, "conf");
-      checkNotNull(keyCoder, "keyCoder");
-      checkNotNull(valueCoder, "valueCoder");
-    }
-
-    @Override
-    public List<BoundedSource<KV<K, V>>> splitIntoBundles(long desiredBundleSizeBytes,
-        PipelineOptions options) throws Exception {
-      // desiredBundleSizeBytes is not being considered as splitting based on this
-      // value is not supported by inputFormat getSplits() method.
-      if (inputSplit != null) {
-        LOG.info("Not splitting source {} because source is already split.", this);
-        return ImmutableList.of((BoundedSource<KV<K, V>>) this);
-      }
-      computeSplitsIfNecessary();
-      LOG.info("Generated {} splits. Size of first split is {} ", inputSplits.size(), inputSplits
-          .get(0).getSplit().getLength());
-      return Lists.transform(inputSplits,
-          new Function<SerializableSplit, BoundedSource<KV<K, V>>>() {
-            @Override
-            public BoundedSource<KV<K, V>> apply(SerializableSplit serializableInputSplit) {
-              HadoopInputFormatBoundedSource<K, V> hifBoundedSource =
-                  new HadoopInputFormatBoundedSource<K, V>(conf, keyCoder, valueCoder,
-                      keyTranslationFunction, valueTranslationFunction, serializableInputSplit);
-              return hifBoundedSource;
-            }
-          });
-    }
-
-    @Override
-    public long getEstimatedSizeBytes(PipelineOptions po) throws Exception {
-      if (inputSplit == null) {
-        // If there are no splits computed yet, then retrieve the splits.
-        computeSplitsIfNecessary();
-        return boundedSourceEstimatedSize;
-      }
-      return inputSplit.getSplit().getLength();
-    }
-
-    /**
-     * Helper function to compute splits. It also calculates the size of the data being read.
-     * Note: this method is executed exactly once; the splits are retrieved and cached in this
-     * source, and are further used by splitIntoBundles() and getEstimatedSizeBytes().
-     */
-    @VisibleForTesting
-    void computeSplitsIfNecessary() throws IOException, InterruptedException {
-      if (inputSplits != null) {
-        return;
-      }
-      createInputFormatInstance();
-      List<InputSplit> splits =
-          inputFormatObj.getSplits(Job.getInstance(conf.getHadoopConfiguration()));
-      if (splits == null) {
-        throw new IOException("Error in computing splits, getSplits() returns null.");
-      }
-      if (splits.isEmpty()) {
-        throw new IOException("Error in computing splits, getSplits() returns a empty list");
-      }
-      boundedSourceEstimatedSize = 0;
-      inputSplits = new ArrayList<SerializableSplit>();
-      for (InputSplit inputSplit : splits) {
-        if (inputSplit == null) {
-          throw new IOException("Error in computing splits, split is null in InputSplits list "
-              + "populated by getSplits() : ");
-        }
-        boundedSourceEstimatedSize += inputSplit.getLength();
-        inputSplits.add(new SerializableSplit(inputSplit));
-      }
-      validateUserInputForKeyAndValue();
-    }
-
-    /**
-     * Creates an instance of the InputFormat class. The InputFormat class name is specified in the Hadoop
-     * configuration.
-     */
-    protected void createInputFormatInstance() throws IOException {
-      if (inputFormatObj == null) {
-        try {
-          taskAttemptContext =
-              new TaskAttemptContextImpl(conf.getHadoopConfiguration(), new TaskAttemptID());
-          inputFormatObj =
-              (InputFormat<?, ?>) conf
-                  .getHadoopConfiguration()
-                  .getClassByName(
-                      conf.getHadoopConfiguration().get("mapreduce.job.inputformat.class"))
-                  .newInstance();
-          /*
-           * If InputFormat explicitly implements interface {@link Configurable}, then setConf()
-           * method of {@link Configurable} needs to be explicitly called to set all the
-           * configuration parameters. For example: InputFormat classes which implement Configurable
-           * are {@link org.apache.hadoop.mapreduce.lib.db.DBInputFormat DBInputFormat}, {@link
-           * org.apache.hadoop.hbase.mapreduce.TableInputFormat TableInputFormat}, etc.
-           */
-          if (Configurable.class.isAssignableFrom(inputFormatObj.getClass())) {
-            ((Configurable) inputFormatObj).setConf(conf.getHadoopConfiguration());
-          }
-        } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
-          throw new IOException("Unable to create InputFormat object: ", e);
-        }
-      }
-    }
-
-    /**
-     * Throws an exception if the configured InputFormat key or value class differs from the
-     * InputFormat's actual key or value class. Incorrect classes may otherwise surface later as
-     * errors such as "unexpected extra bytes after decoding" during decoding, hence this
-     * validation is required.
-     */
-    private void validateUserInputForKeyAndValue() throws IOException, InterruptedException {
-      ParameterizedType genericClassType = determineGenericType();
-      RecordReader<?, ?> reader = fetchFirstRecordReader();
-      boolean isCorrectKeyClassSet =
-          validateClass(genericClassType.getActualTypeArguments()[0].getTypeName(), keyCoder,
-              reader.getCurrentKey(), "key.class");
-      boolean isCorrectValueClassSet =
-          validateClass(genericClassType.getActualTypeArguments()[1].getTypeName(), valueCoder,
-              reader.getCurrentValue(), "value.class");
-      if (!isCorrectKeyClassSet) {
-        Class<?> actualClass = conf.getHadoopConfiguration().getClass("key.class", Object.class);
-        throw new IllegalArgumentException(String.format(
-            "Wrong InputFormat key class in configuration : Expected key.class is %s but was %s.",
-            reader.getCurrentKey().getClass().getName(), actualClass.getName()));
-      }
-      if (!isCorrectValueClassSet) {
-        Class<?> actualClass = conf.getHadoopConfiguration().getClass("value.class", Object.class);
-        throw new IllegalArgumentException(String.format("Wrong InputFormat value class in "
-            + "configuration : Expected value.class is %s but was %s.", reader.getCurrentValue()
-            .getClass().getName(), actualClass.getName()));
-      }
-    }
-
-    /**
-     * Returns true if the key/value class set by the user is compatible with the key/value class
-     * of a pair returned by the RecordReader. The user-provided key/value class is validated
-     * against the type arguments of the InputFormat's parameterized type. If those type arguments
-     * are themselves type variables such as T, K, V, etc., validation is done by encoding and
-     * decoding the key/value object of the first pair returned by the RecordReader.
-     */
-    private <T> boolean validateClass(String inputFormatGenericClassName, Coder coder,
-        Object object, String property) {
-      try {
-        Class<?> inputClass = Class.forName(inputFormatGenericClassName);
-        /*
-         * Validates key/value class with InputFormat's parameterized type.
-         */
-        if (property.equals("key.class")) {
-          return (conf.getHadoopConfiguration().getClass("key.class",
-              Object.class)).isAssignableFrom(inputClass);
-        }
-        return (conf.getHadoopConfiguration().getClass("value.class",
-            Object.class)).isAssignableFrom(inputClass);
-      } catch (ClassNotFoundException e) {
-        /*
-         * Given inputFormatGenericClassName is a type parameter i.e. T, K, V, etc. In such cases
-         * class validation for user provided input key/value will not work correctly. Therefore
-         * the need to validate key/value class by encoding and decoding key/value object with
-         * the given coder.
-         */
-        return checkEncodingAndDecoding((Coder<T>) coder, (T) object);
-      }
-    }
-
-    /**
-     * Validates whether the input gets encoded or decoded correctly using the provided coder.
-     */
-    private <T> boolean checkEncodingAndDecoding(Coder<T> coder, T input) {
-      try {
-        CoderUtils.clone(coder, input);
-      } catch (CoderException e) {
-        return false;
-      }
-      return true;
-    }
-
-    /**
-     * Returns parameterized type of the InputFormat class.
-     */
-    private ParameterizedType determineGenericType() {
-      // Any InputFormatClass always inherits from InputFormat<K, V> which is a ParameterizedType.
-      // Hence, we can fetch generic super class of inputFormatClass which is a ParameterizedType.
-      Class<?> inputFormatClass = inputFormatObj.getClass();
-      Type genericSuperclass = null;
-      for (;;) {
-        genericSuperclass = inputFormatClass.getGenericSuperclass();
-        if (genericSuperclass instanceof ParameterizedType) {
-          break;
-        }
-        inputFormatClass = inputFormatClass.getSuperclass();
-      }
-      return (ParameterizedType) genericSuperclass;
-    }
-
-    /**
-     * Returns a RecordReader for the first split, used to read the first record when validating
-     * the key/value classes.
-     */
-    private RecordReader fetchFirstRecordReader() throws IOException, InterruptedException {
-      RecordReader<?, ?> reader =
-          inputFormatObj.createRecordReader(inputSplits.get(0).getSplit(), taskAttemptContext);
-      if (reader == null) {
-        throw new IOException(String.format("Null RecordReader object returned by %s",
-            inputFormatObj.getClass()));
-      }
-      reader.initialize(inputSplits.get(0).getSplit(), taskAttemptContext);
-      // First record is read to get the InputFormat's key and value classes.
-      reader.nextKeyValue();
-      return reader;
-    }
-
-    @VisibleForTesting
-    InputFormat<?, ?> getInputFormat(){
-      return inputFormatObj;
-    }
-
-    @VisibleForTesting
-    void setInputFormatObj(InputFormat<?, ?> inputFormatObj) {
-      this.inputFormatObj = inputFormatObj;
-    }
-
-    @Override
-    public Coder<KV<K, V>> getDefaultOutputCoder() {
-      return KvCoder.of(keyCoder, valueCoder);
-    }
-
-    @Override
-    public BoundedReader<KV<K, V>> createReader(PipelineOptions options) throws IOException {
-      this.validate();
-      if (inputSplit == null) {
-        throw new IOException("Cannot create reader as source is not split yet.");
-      } else {
-        createInputFormatInstance();
-        return new HadoopInputFormatReader<>(
-            this,
-            keyTranslationFunction,
-            valueTranslationFunction,
-            inputSplit,
-            inputFormatObj,
-            taskAttemptContext);
-      }
-    }
-
-    /**
-     * BoundedReader for Hadoop InputFormat source.
-     *
-     * @param <K> Type of keys RecordReader emits.
-     * @param <V> Type of values RecordReader emits.
-     */
-    class HadoopInputFormatReader<T1, T2> extends BoundedSource.BoundedReader<KV<K, V>> {
-
-      private final HadoopInputFormatBoundedSource<K, V> source;
-      @Nullable private final SimpleFunction<T1, K> keyTranslationFunction;
-      @Nullable private final SimpleFunction<T2, V> valueTranslationFunction;
-      private final SerializableSplit split;
-      private RecordReader<T1, T2> recordReader;
-      private volatile boolean doneReading = false;
-      private volatile long recordsReturned = 0L;
-      // Tracks the progress of the RecordReader.
-      private AtomicDouble progressValue = new AtomicDouble();
-      private transient InputFormat<T1, T2> inputFormatObj;
-      private transient TaskAttemptContext taskAttemptContext;
-
-      private HadoopInputFormatReader(HadoopInputFormatBoundedSource<K, V> source,
-          @Nullable SimpleFunction keyTranslationFunction,
-          @Nullable SimpleFunction valueTranslationFunction,
-          SerializableSplit split,
-          InputFormat inputFormatObj,
-          TaskAttemptContext taskAttemptContext) {
-        this.source = source;
-        this.keyTranslationFunction = keyTranslationFunction;
-        this.valueTranslationFunction = valueTranslationFunction;
-        this.split = split;
-        this.inputFormatObj = inputFormatObj;
-        this.taskAttemptContext = taskAttemptContext;
-      }
-
-      @Override
-      public HadoopInputFormatBoundedSource<K, V> getCurrentSource() {
-        return source;
-      }
-
-      @Override
-      public boolean start() throws IOException {
-        try {
-          recordsReturned = 0;
-          recordReader =
-              (RecordReader<T1, T2>) inputFormatObj.createRecordReader(split.getSplit(),
-                  taskAttemptContext);
-          if (recordReader != null) {
-            recordReader.initialize(split.getSplit(), taskAttemptContext);
-            progressValue.set(getProgress());
-            if (recordReader.nextKeyValue()) {
-              recordsReturned++;
-              doneReading = false;
-              return true;
-            }
-          } else {
-            throw new IOException(String.format("Null RecordReader object returned by %s",
-                inputFormatObj.getClass()));
-          }
-          recordReader = null;
-        } catch (InterruptedException e) {
-          throw new IOException(
-              "Could not read because the thread got interrupted while "
-              + "reading the records with an exception: ",
-              e);
-        }
-        doneReading = true;
-        return false;
-      }
-
-      @Override
-      public boolean advance() throws IOException {
-        try {
-          progressValue.set(getProgress());
-          if (recordReader.nextKeyValue()) {
-            recordsReturned++;
-            return true;
-          }
-          doneReading = true;
-        } catch (InterruptedException e) {
-          throw new IOException("Unable to read data: ", e);
-        }
-        return false;
-      }
-
-      @Override
-      public KV<K, V> getCurrent() {
-        K key = null;
-        V value = null;
-        try {
-          // Transform key if translation function is provided.
-          key =
-              transformKeyOrValue((T1) recordReader.getCurrentKey(), keyTranslationFunction,
-                  keyCoder);
-          // Transform value if translation function is provided.
-          value =
-              transformKeyOrValue((T2) recordReader.getCurrentValue(), valueTranslationFunction,
-                  valueCoder);
-        } catch (IOException | InterruptedException e) {
-          LOG.error("Unable to read data: " + "{}", e);
-          throw new IllegalStateException("Unable to read data: " + "{}", e);
-        }
-        return KV.of(key, value);
-      }
-
-      /**
-       * Returns the transformed key or value object, cloned if it may be mutable.
-       * @throws ClassCastException
-       * @throws CoderException
-       */
-      private <T, T3> T3 transformKeyOrValue(T input,
-          @Nullable SimpleFunction<T, T3> simpleFunction, Coder<T3> coder) throws CoderException,
-          ClassCastException {
-        T3 output;
-        if (null != simpleFunction) {
-          output = simpleFunction.apply(input);
-        } else {
-          output = (T3) input;
-        }
-        return cloneIfPossiblyMutable((T3) output, coder);
-      }
-
-      /**
-       * Beam expects immutable objects, but the Hadoop InputFormats tend to re-use the same object
-       * when returning them. Hence, mutable objects returned by Hadoop InputFormats are cloned.
-       */
-      private <T> T cloneIfPossiblyMutable(T input, Coder<T> coder) throws CoderException,
-          ClassCastException {
-        // If the input object is not of known immutable type, clone the object.
-        if (!isKnownImmutable(input)) {
-          input = CoderUtils.clone(coder, input);
-        }
-        return input;
-      }
-
-      /**
-       * Utility method to check if the passed object is of a known immutable type.
-       */
-      private boolean isKnownImmutable(Object o) {
-        Set<Class<?>> immutableTypes = new HashSet<Class<?>>(
-            Arrays.asList(
-                String.class,
-                Byte.class,
-                Short.class,
-                Integer.class,
-                Long.class,
-                Float.class,
-                Double.class,
-                Boolean.class,
-                BigInteger.class,
-                BigDecimal.class));
-        return immutableTypes.contains(o.getClass());
-      }
-
-      @Override
-      public void close() throws IOException {
-        LOG.info("Closing reader after reading {} records.", recordsReturned);
-        if (recordReader != null) {
-          recordReader.close();
-          recordReader = null;
-        }
-      }
-
-      @Override
-      public Double getFractionConsumed() {
-        if (doneReading) {
-          progressValue.set(1.0);
-        } else if (recordReader == null || recordsReturned == 0) {
-          progressValue.set(0.0);
-        }
-        return progressValue.doubleValue();
-      }
-
-      /**
-       * Returns RecordReader's progress.
-       * @throws IOException
-       * @throws InterruptedException
-       */
-      private Double getProgress() throws IOException, InterruptedException {
-        try {
-          return (double) recordReader.getProgress();
-        } catch (IOException e) {
-          LOG.error(
-              "Error in computing the fractions consumed as RecordReader.getProgress() throws an "
-              + "exception : ", e);
-          throw new IOException(
-              "Error in computing the fractions consumed as RecordReader.getProgress() throws an "
-              + "exception : " + e.getMessage(), e);
-        }
-      }
-
-      @Override
-      public final long getSplitPointsRemaining() {
-        if (doneReading) {
-          return 0;
-        }
-        /*
-         * This source does not currently support dynamic work rebalancing, so remaining parallelism
-         * is always 1.
-         */
-        return 1;
-      }
-    }
-  }
-
-  /**
-   * A wrapper to allow Hadoop {@link org.apache.hadoop.mapreduce.InputSplit} to be serialized using
-   * Java's standard serialization mechanisms.
-   */
-  public static class SerializableSplit implements Serializable {
-
-    InputSplit inputSplit;
-
-    public SerializableSplit() {}
-
-    public SerializableSplit(InputSplit split) {
-      checkArgument(split instanceof Writable,
-          String.format("Split is not of type Writable: %s", split));
-      this.inputSplit = split;
-    }
-
-    public InputSplit getSplit() {
-      return inputSplit;
-    }
-
-    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
-      ObjectWritable ow = new ObjectWritable();
-      ow.setConf(new Configuration(false));
-      ow.readFields(in);
-      this.inputSplit = (InputSplit) ow.get();
-    }
-
-    private void writeObject(ObjectOutputStream out) throws IOException {
-      new ObjectWritable(inputSplit).write(out);
-    }
-  }
-
-  /**
-   * A wrapper to allow Hadoop {@link org.apache.hadoop.conf.Configuration} to be serialized using
-   * Java's standard serialization mechanisms. Note that the org.apache.hadoop.conf.Configuration
-   * is Writable.
-   */
-  public static class SerializableConfiguration implements Externalizable {
-
-    private Configuration conf;
-
-    public SerializableConfiguration() {}
-
-    public SerializableConfiguration(Configuration conf) {
-      this.conf = conf;
-    }
-
-    public Configuration getHadoopConfiguration() {
-      return conf;
-    }
-
-    @Override
-    public void writeExternal(ObjectOutput out) throws IOException {
-      out.writeUTF(conf.getClass().getCanonicalName());
-      ((Writable) conf).write(out);
-    }
-
-    @Override
-    public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
-      String className = in.readUTF();
-      try {
-        conf = (Configuration) Class.forName(className).newInstance();
-        conf.readFields(in);
-      } catch (InstantiationException | IllegalAccessException e) {
-        throw new IOException("Unable to create configuration: " + e);
-      }
-    }
-  }
-}
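To complement the class-level javadoc above, a minimal, self-contained sketch of a read with a key translation, assuming it sits alongside the Employee and EmployeeInputFormat test classes of this module; the class name and the use of default pipeline options are illustrative:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;

public class HadoopInputFormatIOReadSketch {
  public static void main(String[] args) {
    // Mandatory properties, as described in the class-level javadoc.
    Configuration conf = new Configuration(false);
    conf.setClass("mapreduce.job.inputformat.class", EmployeeInputFormat.class, InputFormat.class);
    conf.setClass("key.class", Text.class, Object.class);
    conf.setClass("value.class", Employee.class, Object.class);

    // Translate Hadoop Text keys into Strings so a stock Beam coder applies.
    SimpleFunction<Text, String> keyTranslation = new SimpleFunction<Text, String>() {
      @Override
      public String apply(Text input) {
        return input.toString();
      }
    };

    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    PCollection<KV<String, Employee>> employees =
        p.apply("ReadEmployees",
            HadoopInputFormatIO.<String, Employee>read()
                .withConfiguration(conf)
                .withKeyTranslation(keyTranslation));
    p.run().waitUntilFinish();
  }
}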

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop-input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/package-info.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop-input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/package-info.java b/sdks/java/io/hadoop-input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/package-info.java
deleted file mode 100644
index 5488448..0000000
--- a/sdks/java/io/hadoop-input-format/src/main/java/org/apache/beam/sdk/io/hadoop/inputformat/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * Defines transforms for reading from Data sources which implement Hadoop Input Format.
- *
- * @see org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO
- */
-package org.apache.beam.sdk.io.hadoop.inputformat;

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ConfigurableEmployeeInputFormat.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ConfigurableEmployeeInputFormat.java b/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ConfigurableEmployeeInputFormat.java
deleted file mode 100644
index 40f949b..0000000
--- a/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/ConfigurableEmployeeInputFormat.java
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
- * agreements. See the NOTICE file distributed with this work for additional information regarding
- * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance with the License. You may obtain a
- * copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-package org.apache.beam.sdk.io.hadoop.inputformat;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-
-/**
- * A dummy InputFormat used to test reading through HadoopInputFormatIO when the InputFormat
- * implements Configurable. It validates that the setConf() method is called before getSplits().
- * Known InputFormats which implement Configurable are DBInputFormat, TableInputFormat, etc.
- */
-public class ConfigurableEmployeeInputFormat extends InputFormat<Text, Employee> implements
-    Configurable {
-  public boolean isConfSet = false;
-
-  public ConfigurableEmployeeInputFormat() {}
-
-  @Override
-  public Configuration getConf() {
-    return null;
-  }
-
-  /**
-   * Records that setConf() was called; getSplits() checks this flag to verify the call order.
-   */
-  @Override
-  public void setConf(Configuration conf) {
-    isConfSet = true;
-  }
-
-  @Override
-  public RecordReader<Text, Employee> createRecordReader(InputSplit split,
-      TaskAttemptContext context) throws IOException, InterruptedException {
-    return new ConfigurableEmployeeRecordReader();
-  }
-
-  /**
-   * Returns an InputSplit list containing a single {@link ConfigurableEmployeeInputSplit}. Throws
-   * an exception if {@link #setConf} was not called.
-   */
-  @Override
-  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
-    if (!isConfSet) {
-      throw new IOException("Configuration is not set.");
-    }
-    List<InputSplit> splits = new ArrayList<InputSplit>();
-    splits.add(new ConfigurableEmployeeInputSplit());
-    return splits;
-  }
-
-  /**
-   * InputSplit implementation for ConfigurableEmployeeInputFormat.
-   */
-  public class ConfigurableEmployeeInputSplit extends InputSplit implements Writable {
-
-    @Override
-    public void readFields(DataInput arg0) throws IOException {}
-
-    @Override
-    public void write(DataOutput arg0) throws IOException {}
-
-    @Override
-    public long getLength() throws IOException, InterruptedException {
-      return 0;
-    }
-
-    @Override
-    public String[] getLocations() throws IOException, InterruptedException {
-      return null;
-    }
-  }
-
-  /**
-   * RecordReader for ConfigurableEmployeeInputFormat.
-   */
-  public class ConfigurableEmployeeRecordReader extends RecordReader<Text, Employee> {
-
-    @Override
-    public void initialize(InputSplit paramInputSplit, TaskAttemptContext paramTaskAttemptContext)
-        throws IOException, InterruptedException {}
-
-    @Override
-    public boolean nextKeyValue() throws IOException, InterruptedException {
-      return false;
-    }
-
-    @Override
-    public Text getCurrentKey() throws IOException, InterruptedException {
-      return null;
-    }
-
-    @Override
-    public Employee getCurrentValue() throws IOException, InterruptedException {
-      return null;
-    }
-
-    @Override
-    public float getProgress() throws IOException, InterruptedException {
-      return 0;
-    }
-
-    @Override
-    public void close() throws IOException {}
-  }
-}
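For reference, a minimal sketch of the Configurable handling that this test class exercises, mirroring createInputFormatInstance() above: if the configured InputFormat implements Configurable, setConf() is invoked before getSplits() is called. The helper class and method names are hypothetical:

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;

public class InputFormatInstantiationSketch {
  static InputFormat<?, ?> instantiate(Configuration conf) throws Exception {
    // Load and instantiate the class named by the mandatory HadoopInputFormatIO property.
    InputFormat<?, ?> inputFormat =
        (InputFormat<?, ?>)
            conf.getClassByName(conf.get("mapreduce.job.inputformat.class")).newInstance();
    // DBInputFormat, TableInputFormat and ConfigurableEmployeeInputFormat implement Configurable
    // and expect setConf() to be called before getSplits().
    if (inputFormat instanceof Configurable) {
      ((Configurable) inputFormat).setConf(conf);
    }
    return inputFormat;
  }
}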

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/Employee.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/Employee.java b/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/Employee.java
deleted file mode 100644
index 9d4f293..0000000
--- a/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/Employee.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
- * agreements. See the NOTICE file distributed with this work for additional information regarding
- * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance with the License. You may obtain a
- * copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-package org.apache.beam.sdk.io.hadoop.inputformat;
-
-import org.apache.beam.sdk.coders.AvroCoder;
-import org.apache.beam.sdk.coders.DefaultCoder;
-
-/**
- * Employee POJO with two properties: employee name and address. Used in various
- * {@linkplain HadoopInputFormatIO} unit tests.
- */
-@DefaultCoder(AvroCoder.class)
-public class Employee {
-  private String empAddress;
-  private String empName;
-
-  /**
-   * Empty constructor required for Avro decoding.
-   */
-  public Employee() {}
-
-  public Employee(String empName, String empAddress) {
-    this.empAddress = empAddress;
-    this.empName = empName;
-  }
-
-  public String getEmpName() {
-    return empName;
-  }
-
-  public void setEmpName(String empName) {
-    this.empName = empName;
-  }
-
-  public String getEmpAddress() {
-    return empAddress;
-  }
-
-  public void setEmpAddress(String empAddress) {
-    this.empAddress = empAddress;
-  }
-
-  @Override
-  public boolean equals(Object o) {
-    if (this == o) {
-      return true;
-    }
-    if (o == null || getClass() != o.getClass()) {
-      return false;
-    }
-
-    Employee employeePojo = (Employee) o;
-
-    if (empName != null ? !empName.equals(employeePojo.empName) : employeePojo.empName != null) {
-      return false;
-    }
-    if (empAddress != null ? !empAddress.equals(employeePojo.empAddress)
-        : employeePojo.empAddress != null) {
-      return false;
-    }
-    return true;
-  }
-
-  @Override
-  public int hashCode() {
-    return 0;
-  }
-
-  @Override
-  public String toString() {
-    return "Employee{" + "Name='" + empName + '\'' + ", Address=" + empAddress + '}';
-  }
-}
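As a side note, a minimal sketch of what the @DefaultCoder(AvroCoder.class) annotation above provides: an Employee can be round-tripped through its coder, which is the same mechanism cloneIfPossiblyMutable() and checkEncodingAndDecoding() rely on. The class name and sample field values are arbitrary:

import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.util.CoderUtils;

public class EmployeeCoderSketch {
  public static void main(String[] args) throws Exception {
    AvroCoder<Employee> coder = AvroCoder.of(Employee.class);
    Employee original = new Employee("Alice", "Seattle");  // arbitrary sample data
    // Encode and decode through the coder, as cloneIfPossiblyMutable() does for mutable values.
    Employee copy = CoderUtils.clone(coder, original);
    System.out.println(original.equals(copy));  // prints "true"
  }
}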

http://git-wip-us.apache.org/repos/asf/beam/blob/174436bc/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/EmployeeInputFormat.java
----------------------------------------------------------------------
diff --git a/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/EmployeeInputFormat.java b/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/EmployeeInputFormat.java
deleted file mode 100644
index 206f9ab..0000000
--- a/sdks/java/io/hadoop-input-format/src/test/java/org/apache/beam/sdk/io/hadoop/inputformat/EmployeeInputFormat.java
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
- * agreements. See the NOTICE file distributed with this work for additional information regarding
- * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance with the License. You may obtain a
- * copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed under the License
- * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
- * or implied. See the License for the specific language governing permissions and limitations under
- * the License.
- */
-package org.apache.beam.sdk.io.hadoop.inputformat;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.beam.sdk.values.KV;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-
-/**
- * A valid InputFormat for reading employee data, available in the form of a {@code List<KV>}
- * as {@linkplain EmployeeRecordReader#employeeDataList employeeDataList}.
- * {@linkplain EmployeeRecordReader#employeeDataList employeeDataList} is populated using
- * {@linkplain TestEmployeeDataSet#populateEmployeeData()}.
- * {@linkplain EmployeeInputFormat} is used to test that the
- * {@linkplain HadoopInputFormatIO} source returns immutable records when the
- * RecordReader creates new key and value objects for every record it reads.
- */
-public class EmployeeInputFormat extends InputFormat<Text, Employee> {
-
-  public EmployeeInputFormat() {}
-
-  @Override
-  public RecordReader<Text, Employee> createRecordReader(InputSplit split,
-      TaskAttemptContext context) throws IOException, InterruptedException {
-    return new EmployeeRecordReader();
-  }
-
-  @Override
-  public List<InputSplit> getSplits(JobContext arg0) throws IOException, InterruptedException {
-    List<InputSplit> inputSplitList = new ArrayList<InputSplit>();
-    for (int i = 1; i <= TestEmployeeDataSet.NUMBER_OF_SPLITS; i++) {
-      InputSplit inputSplitObj =
-          new NewObjectsEmployeeInputSplit(
-              ((i - 1) * TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT), (i
-                  * TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT - 1));
-      inputSplitList.add(inputSplitObj);
-    }
-    return inputSplitList;
-  }
-
-  /**
-   * InputSplit implementation for EmployeeInputFormat.
-   */
-  public static class NewObjectsEmployeeInputSplit extends InputSplit implements Writable {
-    // Start and end map index of each split of employeeData.
-    private long startIndex;
-    private long endIndex;
-
-    public NewObjectsEmployeeInputSplit() {}
-
-    public NewObjectsEmployeeInputSplit(long startIndex, long endIndex) {
-      this.startIndex = startIndex;
-      this.endIndex = endIndex;
-    }
-
-    /**
-     * Returns the number of records in this split.
-     */
-    @Override
-    public long getLength() throws IOException, InterruptedException {
-      return this.endIndex - this.startIndex + 1;
-    }
-
-    @Override
-    public String[] getLocations() throws IOException, InterruptedException {
-      return null;
-    }
-
-    public long getStartIndex() {
-      return startIndex;
-    }
-
-    public long getEndIndex() {
-      return endIndex;
-    }
-
-    @Override
-    public void readFields(DataInput dataIn) throws IOException {
-      startIndex = dataIn.readLong();
-      endIndex = dataIn.readLong();
-    }
-
-    @Override
-    public void write(DataOutput dataOut) throws IOException {
-      dataOut.writeLong(startIndex);
-      dataOut.writeLong(endIndex);
-    }
-  }
-
-  /**
-   * RecordReader for EmployeeInputFormat.
-   */
-  public class EmployeeRecordReader extends RecordReader<Text, Employee> {
-
-    private NewObjectsEmployeeInputSplit split;
-    private Text currentKey;
-    private Employee currentValue;
-    private long employeeListIndex = 0L;
-    private long recordsRead = 0L;
-    private List<KV<String, String>> employeeDataList;
-
-    public EmployeeRecordReader() {}
-
-    @Override
-    public void close() throws IOException {}
-
-    @Override
-    public Text getCurrentKey() throws IOException, InterruptedException {
-      return currentKey;
-    }
-
-    @Override
-    public Employee getCurrentValue() throws IOException, InterruptedException {
-      return currentValue;
-    }
-
-    @Override
-    public float getProgress() throws IOException, InterruptedException {
-      return (float) recordsRead / split.getLength();
-    }
-
-    @Override
-    public void initialize(InputSplit split, TaskAttemptContext arg1) throws IOException,
-        InterruptedException {
-      this.split = (NewObjectsEmployeeInputSplit) split;
-      employeeListIndex = this.split.getStartIndex() - 1;
-      recordsRead = 0;
-      employeeDataList = TestEmployeeDataSet.populateEmployeeData();
-      currentValue = new Employee(null, null);
-    }
-
-    @Override
-    public boolean nextKeyValue() throws IOException, InterruptedException {
-      if ((recordsRead++) >= split.getLength()) {
-        return false;
-      }
-      employeeListIndex++;
-      KV<String, String> employeeDetails = employeeDataList.get((int) employeeListIndex);
-      String empData[] = employeeDetails.getValue().split("_");
-      /*
-       * New objects must be returned every time for key and value in order to test the scenario
-       * as discussed in the class javadoc.
-       */
-      currentKey = new Text(employeeDetails.getKey());
-      currentValue = new Employee(empData[0], empData[1]);
-      return true;
-    }
-  }
-}
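Finally, a minimal sketch of the InputFormat/RecordReader protocol that EmployeeInputFormat above and HadoopInputFormatIO's reader both follow: getSplits(), createRecordReader(), initialize(), then nextKeyValue() until it returns false. The counting helper and its name are illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class RecordReaderLoopSketch {
  static <K, V> long countRecords(InputFormat<K, V> inputFormat, Configuration conf)
      throws Exception {
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    long count = 0;
    for (InputSplit split : inputFormat.getSplits(Job.getInstance(conf))) {
      RecordReader<K, V> reader = inputFormat.createRecordReader(split, context);
      reader.initialize(split, context);
      while (reader.nextKeyValue()) {
        count++;  // reader.getCurrentKey()/getCurrentValue() would be consumed here.
      }
      reader.close();
    }
    return count;
  }
}

With EmployeeInputFormat, this loop would be expected to yield
TestEmployeeDataSet.NUMBER_OF_SPLITS * TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT records.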