You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@samza.apache.org by ja...@apache.org on 2018/12/07 03:28:17 UTC

[17/18] samza-hello-samza git commit: Merge latest with master

Merge latest with master


Project: http://git-wip-us.apache.org/repos/asf/samza-hello-samza/repo
Commit: http://git-wip-us.apache.org/repos/asf/samza-hello-samza/commit/82905901
Tree: http://git-wip-us.apache.org/repos/asf/samza-hello-samza/tree/82905901
Diff: http://git-wip-us.apache.org/repos/asf/samza-hello-samza/diff/82905901

Branch: refs/heads/master
Commit: 829059019218ab96a193c242e27803ccdbbf8c3f
Parents: 428f613 e971e93
Author: Jagadish <jv...@linkedin.com>
Authored: Thu Dec 6 19:17:27 2018 -0800
Committer: Jagadish <jv...@linkedin.com>
Committed: Thu Dec 6 19:17:27 2018 -0800

----------------------------------------------------------------------
 README.md                                       |  95 +-
 bin/deploy.sh                                   |   2 +-
 bin/run-azure-application.sh                    |  30 -
 bin/run-event-hubs-zk-application.sh            |  30 +
 build.gradle                                    |  18 +-
 gradle.properties                               |   4 +-
 pom.xml                                         |  74 +-
 quickstart/wordcount.tar.gz                     | Bin 0 -> 67169 bytes
 src/main/assembly/src.xml                       |   4 +
 .../azure-application-local-runner.properties   |  26 +-
 src/main/config/filter-example.properties       |  25 +
 src/main/config/join-example.properties         |  25 +
 .../config/pageview-adclick-joiner.properties   |  36 -
 src/main/config/pageview-filter.properties      |  36 -
 src/main/config/pageview-sessionizer.properties |  36 -
 .../config/remote-table-join-example.properties |  25 +
 .../config/session-window-example.properties    |  24 +
 .../config/stream-table-join-example.properties |  25 +
 .../config/tumbling-pageview-counter.properties |  36 -
 .../config/tumbling-window-example.properties   |  25 +
 ...ikipedia-application-local-runner.properties |  23 -
 .../config/wikipedia-application.properties     |  24 -
 src/main/config/wikipedia-feed.properties       |  24 +-
 src/main/config/wikipedia-parser.properties     |  22 +-
 src/main/config/wikipedia-stats.properties      |  23 +-
 .../samza/examples/azure/AzureApplication.java  |  58 +-
 .../examples/azure/AzureZKLocalApplication.java |   6 +-
 .../samza/examples/cookbook/FilterExample.java  | 101 +++
 .../samza/examples/cookbook/JoinExample.java    | 163 ++++
 .../cookbook/PageViewAdClickJoiner.java         | 137 ---
 .../examples/cookbook/PageViewFilterApp.java    |  79 --
 .../cookbook/PageViewSessionizerApp.java        | 100 ---
 .../cookbook/RemoteTableJoinExample.java        | 200 +++++
 .../examples/cookbook/SessionWindowExample.java | 120 +++
 .../cookbook/StreamTableJoinExample.java        | 174 ++++
 .../cookbook/TumblingPageViewCounterApp.java    |  98 ---
 .../cookbook/TumblingWindowExample.java         | 121 +++
 .../samza/examples/cookbook/data/AdClick.java   |  11 +
 .../application/WikipediaApplication.java       | 143 ++-
 .../WikipediaZkLocalApplication.java            |   6 +-
 .../wikipedia/model/WikipediaParser.java        |   1 +
 .../descriptors/WikipediaInputDescriptor.java   |  42 +
 .../descriptors/WikipediaSystemDescriptor.java  |  53 ++
 .../task/WikipediaStatsStreamTask.java          |  11 +-
 .../WikipediaFeedTaskApplication.java           |  87 ++
 .../WikipediaParserTaskApplication.java         |  72 ++
 .../WikipediaStatsTaskApplication.java          |  68 ++
 .../test/TestSamzaCookBookExamples.java         | 179 ++++
 .../samza/examples/test/utils/TestUtils.java    |  93 ++
 .../test/TestWikipediaApplication.java          |  82 ++
 .../wikipedia/task/test/TestWikipediaTask.java  |  70 ++
 src/test/resources/WikinewsEditEvents.txt       | 104 +++
 src/test/resources/WikipediaEditEvents.txt      | 882 +++++++++++++++++++
 src/test/resources/WikitionaryEditEvents.txt    |   9 +
 54 files changed, 3157 insertions(+), 805 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/samza-hello-samza/blob/82905901/README.md
----------------------------------------------------------------------
diff --cc README.md
index dd676dc,81c7624..975a2c6
--- a/README.md
+++ b/README.md
@@@ -5,85 -5,99 +5,176 @@@ hello-samz
  
  ### About
  
- [Hello Samza](http://samza.apache.org/startup/hello-samza/0.13/) is developed as part of the [Apache Samza](http://samza.apache.org) project. Please direct questions, improvements and bug fixes there. Questions about [Hello Samza](http://samza.apache.org/startup/hello-samza/0.13/) are welcome on the [dev list](http://samza.apache.org/community/mailing-lists.html) and the [Samza JIRA](https://issues.apache.org/jira/browse/SAMZA) has a hello-samza component for filing tickets.
+ [Hello Samza](http://samza.apache.org/startup/hello-samza/latest/) is developed as part of the [Apache Samza](http://samza.apache.org) project. Please direct questions, improvements and bug fixes there. Questions about [Hello Samza](http://samza.apache.org/startup/hello-samza/latest/) are welcome on the [dev list](http://samza.apache.org/community/mailing-lists.html) and the [Samza JIRA](https://issues.apache.org/jira/browse/SAMZA) has a hello-samza component for filing tickets.
+ 
+ ### Instructions
+ 
+ The **Hello Samza** project contains example Samza applications of high-level API as well as low-level API. The following are the instructions to install the binaries and run the applications in a local Yarn cluster. See also [Hello Samza](http://samza.apache.org/startup/hello-samza/latest/) and [Hello Samza High Level API](http://samza.apache.org/learn/tutorials/latest/hello-samza-high-level-yarn.html) for more information.
+ 
+ #### 1. Get the Code
+ 
+ Check out the hello-samza project:
+ 
+ ```
+ git clone https://git.apache.org/samza-hello-samza.git hello-samza
+ cd hello-samza
+ ```
+ 
+ To build hello-samza with the latest Samza master, you can switch to the _latest_ branch:
+ 
+ ```
+ git checkout latest
+ ```
+ 
+ This project contains everything you'll need to run your first Samza application.
+ 
+ #### 2. Start a Grid
+ 
+ A Samza grid usually comprises three different systems: [YARN](http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html), [Kafka](http://kafka.apache.org/), and [ZooKeeper](http://zookeeper.apache.org/). The hello-samza project comes with a script called "grid" to help you setup these systems. Start by running:
+ 
+ ```
+ ./bin/grid bootstrap
+ ```
+ 
+ This command will download, install, and start ZooKeeper, Kafka, and YARN. It will also check out the latest version of Samza and build it. All package files will be put in a sub-directory called "deploy" inside hello-samza's root folder.
+ 
+ If you get a complaint that _JAVA_HOME_ is not set, then you'll need to set it to the path where Java is installed on your system.
+ 
+ Once the grid command completes, you can verify that YARN is up and running by going to [http://localhost:8088](http://localhost:8088). This is the YARN UI.
+ 
+ #### 3. Build a Samza Application Package
+ 
+ Before you can run a Samza application, you need to build a package for it. This package is what YARN uses to deploy your apps on the grid. Use the following command in hello-samza project to build and deploy the example applications:
+ 
+ ```
+ ./bin/deploy.sh
+ ```
+ 
+ #### 4. Run a Samza Application
+ 
+ After you've built your Samza package, you can start the example applications on the grid.
+ 
+ ##### - High-level API Examples
+ 
+ Package [samza.examples.cookbook](https://github.com/apache/samza-hello-samza/tree/master/src/main/java/samza/examples/cookbook) contains various examples of high-level API operator usage, such as map, partitionBy, window and join. Each example is a runnable Samza application with the steps in the class javadocs, e.g. [JoinExample](https://github.com/apache/samza-hello-samza/blob/master/src/main/java/samza/examples/cookbook/JoinExample.java).
+ 
+ Package [samza.examples.wikipedia.application](https://github.com/apache/samza-hello-samza/tree/master/src/main/java/samza/examples/wikipedia/application) contains a small Samza application which consumes the real-time feeds from Wikipedia, extracts the metadata of the events, and calculates statistics of all edits in a 10-second window. You can start the app on the grid using the run-app.sh script:
+ 
+ ```
+ ./deploy/samza/bin/run-app.sh --config-factory=org.apache.samza.config.factories.PropertiesConfigFactory --config-path=file://$PWD/deploy/samza/config/wikipedia-application.properties
+ ```
+ 
+ Once the job is started, you can tail the Kafka topic with:
+ 
+ ```
+ ./deploy/kafka/bin/kafka-console-consumer.sh  --zookeeper localhost:2181 --topic wikipedia-stats
+ ```
+ 
+ A code walkthrough of this application can be found [here](http://samza.apache.org/learn/tutorials/latest/hello-samza-high-level-code.html).
+ 
+ ##### - Low-level API Examples
+ 
+ Package [samza.examples.wikipedia.task](https://github.com/apache/samza-hello-samza/tree/master/src/main/java/samza/examples/wikipedia/task) contains the low-level API Samza code for the Wikipedia example. To run it, use the following scripts:
+ 
+ ```
+ deploy/samza/bin/run-app.sh --config-factory=org.apache.samza.config.factories.PropertiesConfigFactory --config-path=file://$PWD/deploy/samza/config/wikipedia-feed.properties
+ deploy/samza/bin/run-app.sh --config-factory=org.apache.samza.config.factories.PropertiesConfigFactory --config-path=file://$PWD/deploy/samza/config/wikipedia-parser.properties
+ deploy/samza/bin/run-app.sh --config-factory=org.apache.samza.config.factories.PropertiesConfigFactory --config-path=file://$PWD/deploy/samza/config/wikipedia-stats.properties
+ ```
+ 
+ Once the jobs are started, you can use the same _kafka-console-consumer.sh_ command as in the high-level API Wikipedia example to check out the output of the statistics.
+ 
+ #### 5. Run All the Examples as Integration Tests
+ 
+ Each example above is also run with a few messages as an integration test using the TestRunner API. You can find all the test samples in [src/test/java](https://github.com/apache/samza-hello-samza/tree/master/src/test/java). To run them, use:
+ 
+ ```
+ mvn clean package
+ ```
+ 
+ To run a single example as a test, use:
+ 
+ ```
+ mvn test -Dtest=<ClassName>
+ ```
  
 +### Instructions
 +
 +The **Hello Samza** project contains example Samza applications of high-level API as well as low-level API. The following are the instructions to install the binaries and run the applications in a local Yarn cluster. See also [Hello Samza](http://samza.apache.org/startup/hello-samza/0.13/) and [Hello Samza High Level API](http://samza.apache.org/learn/tutorials/latest/hello-samza-high-level-yarn.html) for more information.
 +
 +#### 1. Get the Code
 +
 +Check out the hello-samza project:
 +
 +```
 +git clone https://git.apache.org/samza-hello-samza.git hello-samza
 +cd hello-samza
 +```
 +
 +To build hello-samza with the latest Samza master, you can switch to the _latest_ branch:
 +
 +```
 +git checkout latest
 +```
 +
 +This project contains everything you'll need to run your first Samza application.
 +
 +#### 2. Start a Grid
 +
 +A Samza grid usually comprises three different systems: [YARN](http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html), [Kafka](http://kafka.apache.org/), and [ZooKeeper](http://zookeeper.apache.org/). The hello-samza project comes with a script called "grid" to help you setup these systems. Start by running:
 +
 +```
 +./bin/grid bootstrap
 +```
 +
 +This command will download, install, and start ZooKeeper, Kafka, and YARN. It will also check out the latest version of Samza and build it. All package files will be put in a sub-directory called "deploy" inside hello-samza's root folder.
 +
 +If you get a complaint that _JAVA_HOME_ is not set, then you'll need to set it to the path where Java is installed on your system.
 +
 +Once the grid command completes, you can verify that YARN is up and running by going to [http://localhost:8088](http://localhost:8088). This is the YARN UI.
 +
 +#### 3. Build a Samza Application Package
 +
 +Before you can run a Samza application, you need to build a package for it. This package is what YARN uses to deploy your apps on the grid. Use the following command in hello-samza project to build and deploy the example applications:
 +
 +```
 +./bin/deploy.sh
 +```
 +
 +#### 4. Run a Samza Application
 +
 +After you've built your Samza package, you can start the example applications on the grid.
 +
 +##### - High-level API Examples
 +
 +Package [samza.examples.cookbook](https://github.com/apache/samza-hello-samza/tree/master/src/main/java/samza/examples/cookbook) contains various examples of high-level API operator usage, such as map, partitionBy, window and join. Each example is a runnable Samza application with the steps in the class javadocs, e.g. [JoinExample](https://github.com/apache/samza-hello-samza/blob/master/src/main/java/samza/examples/cookbook/JoinExample.java).
 +
 +Package [samza.examples.wikipedia.application](https://github.com/apache/samza-hello-samza/tree/master/src/main/java/samza/examples/wikipedia/application) contains a small Samza application which consumes the real-time feeds from Wikipedia, extracts the metadata of the events, and calculates statistics of all edits in a 10-second window. You can start the app on the grid using the run-app.sh script:
 +
 +```
 +./deploy/samza/bin/run-app.sh --config-factory=org.apache.samza.config.factories.PropertiesConfigFactory --config-path=file://$PWD/deploy/samza/config/wikipedia-application.properties
 +```
 +
 +Once the job is started, you can tail the Kafka topic with:
 +
 +```
 +./deploy/kafka/bin/kafka-console-consumer.sh  --zookeeper localhost:2181 --topic wikipedia-stats
 +```
 +
 +A code walkthrough of this application can be found [here](http://samza.apache.org/learn/tutorials/latest/hello-samza-high-level-code.html).
 +
 +##### - Low-level API Examples
 +
 +Package [samza.examples.wikipedia.task](https://github.com/apache/samza-hello-samza/tree/master/src/main/java/samza/examples/wikipedia/task) contains the low-level API Samza code for the Wikipedia example. To run it, use the following scripts:
 +
 +```
 +deploy/samza/bin/run-app.sh --config-factory=org.apache.samza.config.factories.PropertiesConfigFactory --config-path=file://$PWD/deploy/samza/config/wikipedia-feed.properties
 +deploy/samza/bin/run-app.sh --config-factory=org.apache.samza.config.factories.PropertiesConfigFactory --config-path=file://$PWD/deploy/samza/config/wikipedia-parser.properties
 +deploy/samza/bin/run-app.sh --config-factory=org.apache.samza.config.factories.PropertiesConfigFactory --config-path=file://$PWD/deploy/samza/config/wikipedia-stats.properties
 +```
 +
 +Once the jobs are started, you can use the same _kafka-console-consumer.sh_ command as in the high-level API Wikipedia example to check out the output of the statistics.
 +
  ### Contribution
  
- To start contributing on [Hello Samza](http://samza.apache.org/startup/hello-samza/0.13/) first read [Rules](http://samza.apache.org/contribute/rules.html) and [Contributor Corner](https://cwiki.apache.org/confluence/display/SAMZA/Contributor%27s+Corner). Notice that [Hello Samza](http://samza.apache.org/startup/hello-samza/0.13/) git repository does not support git pull request.
+ To start contributing on [Hello Samza](http://samza.apache.org/startup/hello-samza/latest/) first read [Rules](http://samza.apache.org/contribute/rules.html) and [Contributor Corner](https://cwiki.apache.org/confluence/display/SAMZA/Contributor%27s+Corner). Notice that [Hello Samza](http://samza.apache.org/startup/hello-samza/latest/) git repository does not support git pull request.

http://git-wip-us.apache.org/repos/asf/samza-hello-samza/blob/82905901/bin/deploy.sh
----------------------------------------------------------------------
diff --cc bin/deploy.sh
index 9526067,3c3ada2..d6ce59c
--- a/bin/deploy.sh
+++ b/bin/deploy.sh
@@@ -23,4 -23,4 +23,4 @@@ base_dir=`pwd
  
  mvn clean package
  mkdir -p $base_dir/deploy/samza
- tar -xvf $base_dir/target/hello-samza-0.14.0-dist.tar.gz -C $base_dir/deploy/samza
 -tar -xvf $base_dir/target/hello-samza-1.0.0-SNAPSHOT-dist.tar.gz -C $base_dir/deploy/samza
++tar -xvf $base_dir/target/hello-samza-1.0.0-dist.tar.gz -C $base_dir/deploy/samza

http://git-wip-us.apache.org/repos/asf/samza-hello-samza/blob/82905901/build.gradle
----------------------------------------------------------------------
diff --cc build.gradle
index cce7699,80dafea..143995d
--- a/build.gradle
+++ b/build.gradle
@@@ -48,18 -56,16 +56,18 @@@ dependencies 
      compile(group: 'org.schwering', name: 'irclib', version: '1.10')
      compile(group: 'org.apache.samza', name: 'samza-api', version: "$SAMZA_VERSION")
      compile(group: 'org.apache.samza', name: 'samza-kv_2.11', version: "$SAMZA_VERSION")
-     compile(group: 'org.apache.samza', name: 'samza-aws', version: "$SAMZA_VERSION")
-     compile(group: 'org.apache.samza', name: 'samza-sql', version: "$SAMZA_VERSION")
 +
+     compile(group: 'org.apache.samza', name: 'samza-test_2.11', version: "$SAMZA_VERSION")
+     compile(group: 'org.apache.samza', name: 'samza-kafka_2.11', version: "$SAMZA_VERSION")
+     compile(group: 'org.apache.samza', name: 'samza-kv-rocksdb_2.11', version: "$SAMZA_VERSION")
+     compile(group: 'org.apache.samza', name: 'samza-azure', version: "$SAMZA_VERSION")
+     testCompile(group: 'junit', name: 'junit', version: "4.12")
      explode (group: 'org.apache.samza', name: 'samza-shell',  ext: 'tgz', classifier: 'dist', version: "$SAMZA_VERSION")
- 
      runtime(group: 'org.apache.samza', name: 'samza-core_2.11', version: "$SAMZA_VERSION")
      runtime(group: 'org.apache.samza', name: 'samza-log4j', version: "$SAMZA_VERSION")
 +
      runtime(group: 'org.apache.samza', name: 'samza-shell', version: "$SAMZA_VERSION")
      runtime(group: 'org.apache.samza', name: 'samza-yarn_2.11', version: "$SAMZA_VERSION")
-     runtime(group: 'org.apache.samza', name: 'samza-kv-rocksdb_2.11', version: "$SAMZA_VERSION")
-     runtime(group: 'org.apache.samza', name: 'samza-kafka_2.11', version: "$SAMZA_VERSION")
      runtime(group: 'org.apache.kafka', name: 'kafka_2.11', version: "$KAFKA_VERSION")
      runtime(group: 'org.apache.hadoop', name: 'hadoop-hdfs', version: "$HADOOP_VERSION")
  }

http://git-wip-us.apache.org/repos/asf/samza-hello-samza/blob/82905901/pom.xml
----------------------------------------------------------------------
diff --cc pom.xml
index 2065d7a,41ff462..230fb50
--- a/pom.xml
+++ b/pom.xml
@@@ -27,7 -27,7 +27,7 @@@ under the License
  
    <groupId>org.apache.samza</groupId>
    <artifactId>hello-samza</artifactId>
-   <version>0.14.0</version>
 -  <version>1.0.0-SNAPSHOT</version>
++  <version>1.0.0</version>
    <packaging>jar</packaging>
    <name>Samza Example</name>
    <description>