Posted to commits@orc.apache.org by om...@apache.org on 2018/05/02 17:31:10 UTC
[1/3] orc git commit: ORC-354. Restore the benchmark module.
Repository: orc
Updated Branches:
refs/heads/master 18083fe28 -> 48ba9241c
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/main/resources/github.schema
----------------------------------------------------------------------
diff --git a/java/bench/src/main/resources/github.schema b/java/bench/src/main/resources/github.schema
new file mode 100644
index 0000000..3b7dd15
--- /dev/null
+++ b/java/bench/src/main/resources/github.schema
@@ -0,0 +1,702 @@
+struct<
+ actor:struct <
+ avatar_url: string,
+ gravatar_id: string,
+ id: int,
+ login: string,
+ url: string>,
+ created_at:timestamp,
+ id:binary,
+ org:struct <
+ avatar_url: string,
+ gravatar_id: string,
+ id: int,
+ login: string,
+ url: string>,
+ payload:struct <
+ action: string,
+ before: binary,
+ comment: struct <
+ _links: struct <
+ html: struct <
+ href: string>,
+ pull_request: struct <
+ href: string>,
+ self: struct <
+ href: string>>,
+ body: string,
+ commit_id: binary,
+ created_at: timestamp,
+ diff_hunk: string,
+ html_url: string,
+ id: int,
+ issue_url: string,
+ line: int,
+ original_commit_id: binary,
+ original_position: int,
+ path: string,
+ position: int,
+ pull_request_url: string,
+ updated_at: timestamp,
+ url: string,
+ user: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>>,
+ commits: array <struct <
+ author: struct <
+ email: string,
+ name: string>,
+ distinct: boolean,
+ message: string,
+ sha: binary,
+ url: string>>,
+ description: string,
+ distinct_size: int,
+ forkee: struct <
+ archive_url: string,
+ assignees_url: string,
+ blobs_url: string,
+ branches_url: string,
+ clone_url: string,
+ collaborators_url: string,
+ comments_url: string,
+ commits_url: string,
+ compare_url: string,
+ contents_url: string,
+ contributors_url: string,
+ created_at: timestamp,
+ default_branch: string,
+ description: string,
+ downloads_url: string,
+ events_url: string,
+ fork: boolean,
+ forks: int,
+ forks_count: int,
+ forks_url: string,
+ full_name: string,
+ git_commits_url: string,
+ git_refs_url: string,
+ git_tags_url: string,
+ git_url: string,
+ has_downloads: boolean,
+ has_issues: boolean,
+ has_pages: boolean,
+ has_wiki: boolean,
+ homepage: string,
+ hooks_url: string,
+ html_url: string,
+ id: int,
+ issue_comment_url: string,
+ issue_events_url: string,
+ issues_url: string,
+ keys_url: string,
+ labels_url: string,
+ language: string,
+ languages_url: string,
+ merges_url: string,
+ milestones_url: string,
+ mirror_url: string,
+ name: string,
+ notifications_url: string,
+ open_issues: int,
+ open_issues_count: int,
+ owner: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>,
+ private: boolean,
+ public: boolean,
+ pulls_url: string,
+ pushed_at: timestamp,
+ releases_url: string,
+ size: int,
+ ssh_url: string,
+ stargazers_count: int,
+ stargazers_url: string,
+ statuses_url: string,
+ subscribers_url: string,
+ subscription_url: string,
+ svn_url: string,
+ tags_url: string,
+ teams_url: string,
+ trees_url: string,
+ updated_at: timestamp,
+ url: string,
+ watchers: int,
+ watchers_count: int>,
+ head: binary,
+ issue: struct <
+ assignee: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>,
+ body: string,
+ closed_at: timestamp,
+ comments: int,
+ comments_url: string,
+ created_at: timestamp,
+ events_url: string,
+ html_url: string,
+ id: int,
+ labels: array <struct <
+ color: binary,
+ name: string,
+ url: string>>,
+ labels_url: string,
+ locked: boolean,
+ milestone: struct <
+ closed_at: timestamp,
+ closed_issues: int,
+ created_at: timestamp,
+ creator: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>,
+ description: string,
+ due_on: timestamp,
+ html_url: string,
+ id: int,
+ labels_url: string,
+ number: int,
+ open_issues: int,
+ state: string,
+ title: string,
+ updated_at: timestamp,
+ url: string>,
+ number: int,
+ pull_request: struct <
+ diff_url: string,
+ html_url: string,
+ patch_url: string,
+ url: string>,
+ state: string,
+ title: string,
+ updated_at: timestamp,
+ url: string,
+ user: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>>,
+ master_branch: string,
+ member: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>,
+ number: int,
+ pages: array <struct <
+ action: string,
+ html_url: string,
+ page_name: string,
+ sha: binary,
+ summary: string,
+ title: string>>,
+ pull_request: struct <
+ _links: struct <
+ comments: struct <
+ href: string>,
+ commits: struct <
+ href: string>,
+ html: struct <
+ href: string>,
+ issue: struct <
+ href: string>,
+ review_comment: struct <
+ href: string>,
+ review_comments: struct <
+ href: string>,
+ self: struct <
+ href: string>,
+ statuses: struct <
+ href: string>>,
+ additions: int,
+ assignee: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>,
+ base: struct <
+ label: string,
+ ref: string,
+ repo: struct <
+ archive_url: string,
+ assignees_url: string,
+ blobs_url: string,
+ branches_url: string,
+ clone_url: string,
+ collaborators_url: string,
+ comments_url: string,
+ commits_url: string,
+ compare_url: string,
+ contents_url: string,
+ contributors_url: string,
+ created_at: timestamp,
+ default_branch: string,
+ description: string,
+ downloads_url: string,
+ events_url: string,
+ fork: boolean,
+ forks: int,
+ forks_count: int,
+ forks_url: string,
+ full_name: string,
+ git_commits_url: string,
+ git_refs_url: string,
+ git_tags_url: string,
+ git_url: string,
+ has_downloads: boolean,
+ has_issues: boolean,
+ has_pages: boolean,
+ has_wiki: boolean,
+ homepage: string,
+ hooks_url: string,
+ html_url: string,
+ id: int,
+ issue_comment_url: string,
+ issue_events_url: string,
+ issues_url: string,
+ keys_url: string,
+ labels_url: string,
+ language: string,
+ languages_url: string,
+ merges_url: string,
+ milestones_url: string,
+ mirror_url: string,
+ name: string,
+ notifications_url: string,
+ open_issues: int,
+ open_issues_count: int,
+ owner: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>,
+ private: boolean,
+ pulls_url: string,
+ pushed_at: timestamp,
+ releases_url: string,
+ size: int,
+ ssh_url: string,
+ stargazers_count: int,
+ stargazers_url: string,
+ statuses_url: string,
+ subscribers_url: string,
+ subscription_url: string,
+ svn_url: string,
+ tags_url: string,
+ teams_url: string,
+ trees_url: string,
+ updated_at: timestamp,
+ url: string,
+ watchers: int,
+ watchers_count: int>,
+ sha: binary,
+ user: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>>,
+ body: string,
+ changed_files: int,
+ closed_at: timestamp,
+ comments: int,
+ comments_url: string,
+ commits: int,
+ commits_url: string,
+ created_at: timestamp,
+ deletions: int,
+ diff_url: string,
+ head: struct <
+ label: string,
+ ref: string,
+ repo: struct <
+ archive_url: string,
+ assignees_url: string,
+ blobs_url: string,
+ branches_url: string,
+ clone_url: string,
+ collaborators_url: string,
+ comments_url: string,
+ commits_url: string,
+ compare_url: string,
+ contents_url: string,
+ contributors_url: string,
+ created_at: timestamp,
+ default_branch: string,
+ description: string,
+ downloads_url: string,
+ events_url: string,
+ fork: boolean,
+ forks: int,
+ forks_count: int,
+ forks_url: string,
+ full_name: string,
+ git_commits_url: string,
+ git_refs_url: string,
+ git_tags_url: string,
+ git_url: string,
+ has_downloads: boolean,
+ has_issues: boolean,
+ has_pages: boolean,
+ has_wiki: boolean,
+ homepage: string,
+ hooks_url: string,
+ html_url: string,
+ id: int,
+ issue_comment_url: string,
+ issue_events_url: string,
+ issues_url: string,
+ keys_url: string,
+ labels_url: string,
+ language: string,
+ languages_url: string,
+ merges_url: string,
+ milestones_url: string,
+ mirror_url: string,
+ name: string,
+ notifications_url: string,
+ open_issues: int,
+ open_issues_count: int,
+ owner: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>,
+ private: boolean,
+ pulls_url: string,
+ pushed_at: timestamp,
+ releases_url: string,
+ size: int,
+ ssh_url: string,
+ stargazers_count: int,
+ stargazers_url: string,
+ statuses_url: string,
+ subscribers_url: string,
+ subscription_url: string,
+ svn_url: string,
+ tags_url: string,
+ teams_url: string,
+ trees_url: string,
+ updated_at: timestamp,
+ url: string,
+ watchers: int,
+ watchers_count: int>,
+ sha: binary,
+ user: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>>,
+ html_url: string,
+ id: int,
+ issue_url: string,
+ locked: boolean,
+ merge_commit_sha: string,
+ mergeable: boolean,
+ mergeable_state: string,
+ merged: boolean,
+ merged_at: timestamp,
+ merged_by: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>,
+ milestone: struct <
+ closed_at: timestamp,
+ closed_issues: int,
+ created_at: timestamp,
+ creator: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>,
+ description: string,
+ due_on: timestamp,
+ html_url: string,
+ id: int,
+ labels_url: string,
+ number: int,
+ open_issues: int,
+ state: string,
+ title: string,
+ updated_at: timestamp,
+ url: string>,
+ number: int,
+ patch_url: string,
+ review_comment_url: string,
+ review_comments: int,
+ review_comments_url: string,
+ state: string,
+ statuses_url: string,
+ title: string,
+ updated_at: timestamp,
+ url: string,
+ user: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>>,
+ push_id: int,
+ pusher_type: string,
+ ref: string,
+ ref_type: string,
+ release: struct <
+ assets: array <struct <
+ browser_download_url: string,
+ content_type: string,
+ created_at: timestamp,
+ download_count: int,
+ id: int,
+ label: string,
+ name: string,
+ size: int,
+ state: string,
+ updated_at: timestamp,
+ uploader: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>,
+ url: string>>,
+ assets_url: string,
+ author: struct <
+ avatar_url: string,
+ events_url: string,
+ followers_url: string,
+ following_url: string,
+ gists_url: string,
+ gravatar_id: string,
+ html_url: string,
+ id: int,
+ login: string,
+ organizations_url: string,
+ received_events_url: string,
+ repos_url: string,
+ site_admin: boolean,
+ starred_url: string,
+ subscriptions_url: string,
+ type: string,
+ url: string>,
+ body: string,
+ created_at: timestamp,
+ draft: boolean,
+ html_url: string,
+ id: int,
+ name: string,
+ prerelease: boolean,
+ published_at: timestamp,
+ tag_name: string,
+ tarball_url: string,
+ target_commitish: string,
+ upload_url: string,
+ url: string,
+ zipball_url: string>,
+ size: int>,
+ public: boolean,
+ repo: struct <
+ id: int,
+ name: string,
+ url: string>,
+ type: string
+>
\ No newline at end of file
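The schema above is written in ORC's TypeDescription string syntax. As an illustrative sketch (not part of this commit), such a schema string can be parsed with the public org.apache.orc.TypeDescription API; the trimmed inline fragment of the github schema below is hypothetical:

import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.TypeDescription;

public class SchemaParseSketch {
  public static void main(String[] args) {
    // Hypothetical trimmed fragment of the github schema above.
    TypeDescription schema = TypeDescription.fromString(
        "struct<actor:struct<avatar_url:string,gravatar_id:string," +
        "id:int,login:string,url:string>,created_at:timestamp,id:binary>");
    System.out.println("fields = " + schema.getFieldNames());
    System.out.println("max column id = " + schema.getMaximumId());
    // The parsed schema can allocate a vectorized batch for reading.
    VectorizedRowBatch batch = schema.createRowBatch();
    System.out.println("top-level columns = " + batch.numCols);
  }
}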
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/main/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/java/bench/src/main/resources/log4j.properties b/java/bench/src/main/resources/log4j.properties
new file mode 100644
index 0000000..625331e
--- /dev/null
+++ b/java/bench/src/main/resources/log4j.properties
@@ -0,0 +1,18 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+log4j.rootLogger=WARN, CONSOLE
+
+# CONSOLE is set to be a ConsoleAppender using a PatternLayout
+log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
+log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
+log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %m%n
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/main/resources/sales.schema
----------------------------------------------------------------------
diff --git a/java/bench/src/main/resources/sales.schema b/java/bench/src/main/resources/sales.schema
new file mode 100644
index 0000000..df96409
--- /dev/null
+++ b/java/bench/src/main/resources/sales.schema
@@ -0,0 +1,56 @@
+struct<
+ sales_id:bigint,
+ customer_id:bigint,
+ col3:bigint,
+ item_category:bigint,
+ item_count:bigint,
+ change_ts:timestamp,
+ store_location:string,
+ associate_id:string,
+ col9:bigint,
+ rebate_id:string,
+ create_ts:timestamp,
+ col13:bigint,
+ size:string,
+ col14:bigint,
+ fulfilled:boolean,
+ global_id:string,
+ col17:string,
+ col18:string,
+ col19:bigint,
+ has_rebate:boolean,
+ col21:array<
+ struct<
+ sub1:bigint,
+ sub2:string,
+ sub3:string,
+ sub4:bigint,
+ sub5:bigint,
+ sub6:string>>,
+ vendor_id:string,
+ country:string,
+ backend_version:string,
+ col41:bigint,
+ col42:bigint,
+ col43:bigint,
+ col44:bigint,
+ col45:bigint,
+ col46:bigint,
+ col47:bigint,
+ col48:bigint,
+ col49:string,
+ col50:string,
+ col51:bigint,
+ col52:bigint,
+ col53:bigint,
+ col54:bigint,
+ col55:string,
+ col56:timestamp,
+ col57:timestamp,
+ md5:bigint,
+ col59:bigint,
+ col69:timestamp,
+ col61:string,
+ col62:string,
+ col63:timestamp,
+ col64:bigint>
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/main/resources/taxi.schema
----------------------------------------------------------------------
diff --git a/java/bench/src/main/resources/taxi.schema b/java/bench/src/main/resources/taxi.schema
new file mode 100644
index 0000000..5eb7c0f
--- /dev/null
+++ b/java/bench/src/main/resources/taxi.schema
@@ -0,0 +1,21 @@
+struct<
+ vendor_id: int,
+ pickup_time: timestamp,
+ dropoff_time: timestamp,
+ passenger_count: int,
+ trip_distance: double,
+ pickup_longitude: double,
+ pickup_latitude: double,
+ ratecode_id: int,
+ store_and_fwd_flag: string,
+ dropoff_longitude: double,
+ dropoff_latitude: double,
+ payment_type: int,
+ fare_amount: decimal(8,2),
+ extra: decimal(8,2),
+ mta_tax: decimal(8,2),
+ tip_amount: decimal(8,2),
+ tolls_amount: decimal(8,2),
+ improvement_surcharge: decimal(8,2),
+ total_amount: decimal(8,2)
+>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/pom.xml
----------------------------------------------------------------------
diff --git a/java/pom.xml b/java/pom.xml
index 108b637..67062bc 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -332,6 +332,13 @@
</plugins>
</build>
</profile>
+ <profile>
+ <!-- a developer profile to build some benchmarks -->
+ <id>benchmark</id>
+ <modules>
+ <module>bench</module>
+ </modules>
+ </profile>
</profiles>
<dependencyManagement>
@@ -375,11 +382,6 @@
<version>11.0.2</version>
</dependency>
<dependency>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-core</artifactId>
- <version>2.8.4</version>
- </dependency>
- <dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>2.5.0</version>
@@ -413,7 +415,7 @@
<dependency>
<groupId>io.airlift</groupId>
<artifactId>aircompressor</artifactId>
- <version>0.8</version>
+ <version>0.10</version>
<exclusions>
<exclusion>
<groupId>io.airlift</groupId>
@@ -608,17 +610,6 @@
</exclusions>
</dependency>
<dependency>
- <groupId>org.jodd</groupId>
- <artifactId>jodd-core</artifactId>
- <version>3.5.2</version>
- <scope>runtime</scope>
- </dependency>
- <dependency>
- <groupId>org.openjdk.jmh</groupId>
- <artifactId>jmh-core</artifactId>
- <version>1.18</version>
- </dependency>
- <dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.5</version>
[3/3] orc git commit: ORC-354. Restore the benchmark module.
Posted by om...@apache.org.
ORC-354. Restore the benchmark module.
This reverts commit b86d70aa73289b86e066cc019ea11e0d83c1e40d.
Signed-off-by: Owen O'Malley <om...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/48ba9241
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/48ba9241
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/48ba9241
Branch: refs/heads/master
Commit: 48ba9241cd5984ce4fa32c79f729f7c20623351d
Parents: 18083fe
Author: Owen O'Malley <om...@apache.org>
Authored: Fri Apr 27 16:58:48 2018 -0700
Committer: Owen O'Malley <om...@apache.org>
Committed: Wed May 2 10:28:40 2018 -0700
----------------------------------------------------------------------
java/CMakeLists.txt | 2 +-
java/bench/.gitignore | 5 +
java/bench/README.md | 33 +
java/bench/fetch-data.sh | 21 +
java/bench/pom.xml | 217 ++++++
java/bench/src/assembly/uber.xml | 33 +
java/bench/src/findbugs/exclude.xml | 25 +
.../hadoop/fs/TrackingLocalFileSystem.java | 57 ++
.../hive/ql/io/orc/OrcBenchmarkUtilities.java | 54 ++
.../orc/bench/ColumnProjectionBenchmark.java | 188 +++++
.../org/apache/orc/bench/CompressionKind.java | 87 +++
.../src/java/org/apache/orc/bench/Driver.java | 78 +++
.../org/apache/orc/bench/FullReadBenchmark.java | 223 ++++++
.../org/apache/orc/bench/RandomGenerator.java | 524 ++++++++++++++
.../org/apache/orc/bench/SalesGenerator.java | 206 ++++++
.../java/org/apache/orc/bench/Utilities.java | 127 ++++
.../apache/orc/bench/convert/BatchReader.java | 34 +
.../apache/orc/bench/convert/BatchWriter.java | 34 +
.../orc/bench/convert/GenerateVariants.java | 220 ++++++
.../apache/orc/bench/convert/ScanVariants.java | 87 +++
.../orc/bench/convert/avro/AvroReader.java | 299 ++++++++
.../orc/bench/convert/avro/AvroSchemaUtils.java | 192 +++++
.../orc/bench/convert/avro/AvroWriter.java | 363 ++++++++++
.../apache/orc/bench/convert/csv/CsvReader.java | 175 +++++
.../orc/bench/convert/json/JsonReader.java | 279 ++++++++
.../orc/bench/convert/json/JsonWriter.java | 217 ++++++
.../apache/orc/bench/convert/orc/OrcReader.java | 50 ++
.../apache/orc/bench/convert/orc/OrcWriter.java | 54 ++
.../bench/convert/parquet/ParquetReader.java | 297 ++++++++
.../bench/convert/parquet/ParquetWriter.java | 86 +++
java/bench/src/main/resources/github.schema | 702 +++++++++++++++++++
java/bench/src/main/resources/log4j.properties | 18 +
java/bench/src/main/resources/sales.schema | 56 ++
java/bench/src/main/resources/taxi.schema | 21 +
java/pom.xml | 25 +-
35 files changed, 5071 insertions(+), 18 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt
index 6845ae1..e82898c 100644
--- a/java/CMakeLists.txt
+++ b/java/CMakeLists.txt
@@ -23,7 +23,7 @@ set(ORC_JARS
)
if (ANALYZE_JAVA)
- set(JAVA_PROFILE "-Pcmake,analyze")
+ set(JAVA_PROFILE "-Pcmake,analyze,benchmark")
else()
set(JAVA_PROFILE "-Pcmake")
endif()
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/.gitignore
----------------------------------------------------------------------
diff --git a/java/bench/.gitignore b/java/bench/.gitignore
new file mode 100644
index 0000000..babcae6
--- /dev/null
+++ b/java/bench/.gitignore
@@ -0,0 +1,5 @@
+.*.crc
+*.json.gz
+*.avro
+*.parquet
+*.orc
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/README.md
----------------------------------------------------------------------
diff --git a/java/bench/README.md b/java/bench/README.md
new file mode 100644
index 0000000..12cedea
--- /dev/null
+++ b/java/bench/README.md
@@ -0,0 +1,33 @@
+# File Format Benchmarks
+
+These benchmarks compare the big data file formats:
+
+* Avro
+* JSON
+* ORC
+* Parquet
+
+To build this library:
+
+```% mvn clean package```
+
+To fetch the source data:
+
+```% ./fetch-data.sh```
+
+To generate the derived data:
+
+```% java -jar target/orc-benchmarks-*-uber.jar generate data```
+
+To run a scan of all of the data:
+
+```% java -jar target/orc-benchmarks-*-uber.jar scan data```
+
+To run the full read benchmark:
+
+```% java -jar target/orc-benchmarks-*-uber.jar read-all data```
+
+To run the column projection benchmark:
+
+```% java -jar target/orc-benchmarks-*-uber.jar read-some data```
+
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/fetch-data.sh
----------------------------------------------------------------------
diff --git a/java/bench/fetch-data.sh b/java/bench/fetch-data.sh
new file mode 100755
index 0000000..129d83f
--- /dev/null
+++ b/java/bench/fetch-data.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+mkdir -p data/sources/taxi
+(cd data/sources/taxi; wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-{11,12}.csv)
+(cd data/sources/taxi; gzip *.csv)
+mkdir -p data/sources/github
+(cd data/sources/github; wget http://data.githubarchive.org/2015-11-{01..15}-{0..23}.json.gz)
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/pom.xml
----------------------------------------------------------------------
diff --git a/java/bench/pom.xml b/java/bench/pom.xml
new file mode 100644
index 0000000..148693a
--- /dev/null
+++ b/java/bench/pom.xml
@@ -0,0 +1,217 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache</groupId>
+ <artifactId>apache</artifactId>
+ <version>18</version>
+ <relativePath></relativePath>
+ </parent>
+
+ <groupId>org.apache.orc</groupId>
+ <artifactId>orc-benchmarks</artifactId>
+ <version>1.5.0-SNAPSHOT</version>
+ <packaging>jar</packaging>
+ <name>ORC Benchmarks</name>
+ <description>
+ Benchmarks for comparing ORC, Parquet, JSON, and Avro performance.
+ </description>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <maven.compiler.useIncrementalCompilation>false</maven.compiler.useIncrementalCompilation>
+
+ <avro.version>1.8.2</avro.version>
+ <hadoop.version>2.7.3</hadoop.version>
+ <hive.version>2.3.3</hive.version>
+ <orc.version>1.5.0-SNAPSHOT</orc.version>
+ <parquet.version>1.9.0</parquet.version>
+ <storage-api.version>2.5.0</storage-api.version>
+ <zookeeper.version>3.4.6</zookeeper.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-core</artifactId>
+ <version>2.8.4</version>
+ </dependency>
+ <dependency>
+ <groupId>com.google.code.gson</groupId>
+ <artifactId>gson</artifactId>
+ <version>2.2.4</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-cli</groupId>
+ <artifactId>commons-cli</artifactId>
+ <version>1.3.1</version>
+ </dependency>
+ <dependency>
+ <groupId>io.airlift</groupId>
+ <artifactId>aircompressor</artifactId>
+ <version>0.10</version>
+ <exclusions>
+ <exclusion>
+ <groupId>io.airlift</groupId>
+ <artifactId>slice</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.avro</groupId>
+ <artifactId>avro</artifactId>
+ <version>${avro.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.avro</groupId>
+ <artifactId>avro-mapred</artifactId>
+ <classifier>hadoop2</classifier>
+ <version>${avro.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-csv</artifactId>
+ <version>1.4</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-common</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-hdfs</artifactId>
+ <version>${hadoop.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-mapreduce-client-core</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-exec</artifactId>
+ <classifier>core</classifier>
+ <version>${hive.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-serde</artifactId>
+ <version>${hive.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-storage-api</artifactId>
+ <version>${storage-api.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.orc</groupId>
+ <artifactId>orc-core</artifactId>
+ <version>${orc.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.parquet</groupId>
+ <artifactId>parquet-hadoop-bundle</artifactId>
+ <version>${parquet.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.jodd</groupId>
+ <artifactId>jodd-core</artifactId>
+ <version>3.5.2</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.openjdk.jmh</groupId>
+ <artifactId>jmh-core</artifactId>
+ <version>1.18</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <sourceDirectory>${basedir}/src/java</sourceDirectory>
+ <testSourceDirectory>${basedir}/src/test</testSourceDirectory>
+ <testResources>
+ <testResource>
+ <directory>${basedir}/src/test/resources</directory>
+ </testResource>
+ </testResources>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-enforcer-plugin</artifactId>
+ <version>3.0.0-M1</version>
+ <executions>
+ <execution>
+ <id>enforce-maven</id>
+ <goals>
+ <goal>enforce</goal>
+ </goals>
+ <configuration>
+ <rules>
+ <requireMavenVersion>
+ <version>2.2.1</version>
+ </requireMavenVersion>
+ </rules>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>3.1</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifest>
+ <mainClass>org.apache.orc.bench.Driver</mainClass>
+ </manifest>
+ </archive>
+ <descriptors>
+ <descriptor>src/assembly/uber.xml</descriptor>
+ </descriptors>
+ </configuration>
+ <executions>
+ <execution>
+ <id>make-assembly</id> <!-- this is used for inheritance merges -->
+ <phase>package</phase> <!-- bind to the packaging phase -->
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+
+ <profiles>
+ <profile>
+ <id>cmake</id>
+ <build>
+ <directory>${build.dir}/bench</directory>
+ </build>
+ </profile>
+ </profiles>
+</project>
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/assembly/uber.xml
----------------------------------------------------------------------
diff --git a/java/bench/src/assembly/uber.xml b/java/bench/src/assembly/uber.xml
new file mode 100644
index 0000000..014eab9
--- /dev/null
+++ b/java/bench/src/assembly/uber.xml
@@ -0,0 +1,33 @@
+<!--
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<assembly>
+ <id>uber</id>
+ <formats>
+ <format>jar</format>
+ </formats>
+ <includeBaseDirectory>false</includeBaseDirectory>
+ <dependencySets>
+ <dependencySet>
+ <outputDirectory>/</outputDirectory>
+ <useProjectArtifact>true</useProjectArtifact>
+ <unpack>true</unpack>
+ <scope>runtime</scope>
+ </dependencySet>
+ </dependencySets>
+ <containerDescriptorHandlers>
+ <containerDescriptorHandler>
+ <handlerName>metaInf-services</handlerName>
+ </containerDescriptorHandler>
+ </containerDescriptorHandlers>
+</assembly>
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/findbugs/exclude.xml
----------------------------------------------------------------------
diff --git a/java/bench/src/findbugs/exclude.xml b/java/bench/src/findbugs/exclude.xml
new file mode 100644
index 0000000..dde1471
--- /dev/null
+++ b/java/bench/src/findbugs/exclude.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<FindBugsFilter>
+ <Match>
+ <Bug pattern="EI_EXPOSE_REP,EI_EXPOSE_REP2"/>
+ </Match>
+ <Match>
+ <Class name="~org\.openjdk\.jmh\.infra\.generated.*"/>
+ </Match>
+ <Match>
+ <Class name="~org\.apache\.orc\.bench\.generated.*"/>
+ </Match>
+</FindBugsFilter>
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/hadoop/fs/TrackingLocalFileSystem.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/hadoop/fs/TrackingLocalFileSystem.java b/java/bench/src/java/org/apache/hadoop/fs/TrackingLocalFileSystem.java
new file mode 100644
index 0000000..0440495
--- /dev/null
+++ b/java/bench/src/java/org/apache/hadoop/fs/TrackingLocalFileSystem.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+public class TrackingLocalFileSystem extends RawLocalFileSystem {
+
+ class TrackingFileInputStream extends RawLocalFileSystem.LocalFSFileInputStream {
+ public TrackingFileInputStream(Path f) throws IOException {
+ super(f);
+ }
+
+ public int read() throws IOException {
+ statistics.incrementReadOps(1);
+ return super.read();
+ }
+
+ public int read(byte[] b, int off, int len) throws IOException {
+ statistics.incrementReadOps(1);
+ return super.read(b, off, len);
+ }
+
+ public int read(long position, byte[] b, int off, int len) throws IOException {
+ statistics.incrementReadOps(1);
+ return super.read(position, b, off, len);
+ }
+ }
+
+ public FSDataInputStream open(Path f, int bufferSize) throws IOException {
+ if (!exists(f)) {
+ throw new FileNotFoundException(f.toString());
+ }
+ return new FSDataInputStream(new BufferedFSInputStream(
+ new TrackingFileInputStream(f), bufferSize));
+ }
+
+ public FileSystem.Statistics getLocalStatistics() {
+ return statistics;
+ }
+}
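As an illustrative sketch (not part of this commit), TrackingLocalFileSystem can be exercised directly; this mirrors how the benchmarks in this commit initialize it against file:/// and read the statistics back. The input path /tmp/example.orc is hypothetical:

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.TrackingLocalFileSystem;

public class TrackingSketch {
  public static void main(String[] args) throws Exception {
    TrackingLocalFileSystem fs = new TrackingLocalFileSystem();
    fs.initialize(new URI("file:///"), new Configuration());
    FileSystem.Statistics statistics = fs.getLocalStatistics();
    statistics.reset();
    // Hypothetical local file; every read below increments the counters.
    try (FSDataInputStream in = fs.open(new Path("/tmp/example.orc"))) {
      byte[] buffer = new byte[4096];
      while (in.read(buffer) != -1) {
        // drain the stream
      }
    }
    System.out.println("read ops = " + statistics.getReadOps());
    System.out.println("bytes read = " + statistics.getBytesRead());
  }
}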
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/hadoop/hive/ql/io/orc/OrcBenchmarkUtilities.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/hadoop/hive/ql/io/orc/OrcBenchmarkUtilities.java b/java/bench/src/java/org/apache/hadoop/hive/ql/io/orc/OrcBenchmarkUtilities.java
new file mode 100644
index 0000000..18c5d06
--- /dev/null
+++ b/java/bench/src/java/org/apache/hadoop/hive/ql/io/orc/OrcBenchmarkUtilities.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.io.orc;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.io.Writable;
+import org.apache.orc.OrcProto;
+import org.apache.orc.OrcUtils;
+import org.apache.orc.TypeDescription;
+
+import java.util.List;
+
+/**
+ * Utilities that need the non-public methods from Hive.
+ */
+public class OrcBenchmarkUtilities {
+
+ public static StructObjectInspector createObjectInspector(TypeDescription schema) {
+ List<OrcProto.Type> types = OrcUtils.getOrcTypes(schema);
+ return (StructObjectInspector) OrcStruct.createObjectInspector(0, types);
+ }
+
+ public static Writable nextObject(VectorizedRowBatch batch,
+ TypeDescription schema,
+ int rowId,
+ Writable obj) {
+ OrcStruct result = (OrcStruct) obj;
+ if (result == null) {
+ result = new OrcStruct(batch.cols.length);
+ }
+ List<TypeDescription> childrenTypes = schema.getChildren();
+ for(int c=0; c < batch.cols.length; ++c) {
+ result.setFieldValue(c, RecordReaderImpl.nextValue(batch.cols[c], rowId,
+ childrenTypes.get(c), result.getFieldValue(c)));
+ }
+ return result;
+ }
+}
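As an illustrative sketch (not part of this commit), the helper above can be driven row by row over a vectorized stream; the RecordReader and TypeDescription parameters stand in for an ORC reader and its schema obtained elsewhere:

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.orc.OrcBenchmarkUtilities;
import org.apache.hadoop.io.Writable;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

public class NextObjectSketch {
  // Materialize every row as a (reused) Hive OrcStruct and count them.
  static long countRows(RecordReader rows, TypeDescription schema)
      throws IOException {
    VectorizedRowBatch batch = schema.createRowBatch();
    Writable row = null;
    long count = 0;
    while (rows.nextBatch(batch)) {
      for (int r = 0; r < batch.size; ++r) {
        row = OrcBenchmarkUtilities.nextObject(batch, schema, r, row);
        ++count;
      }
    }
    return count;
  }
}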
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java b/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java
new file mode 100644
index 0000000..4afaaf1
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/ColumnProjectionBenchmark.java
@@ -0,0 +1,188 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.TrackingLocalFileSystem;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
+import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+import org.apache.parquet.hadoop.ParquetInputFormat;
+import org.openjdk.jmh.annotations.AuxCounters;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+import java.net.URI;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.AverageTime)
+@Warmup(iterations=1, time=10, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations=3, time=10, timeUnit = TimeUnit.SECONDS)
+@State(Scope.Thread)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@Fork(1)
+public class ColumnProjectionBenchmark {
+
+ private static final String ROOT_ENVIRONMENT_NAME = "bench.root.dir";
+ private static final Path root;
+ static {
+ String value = System.getProperty(ROOT_ENVIRONMENT_NAME);
+ root = value == null ? null : new Path(value);
+ }
+
+ @Param({ "github", "sales", "taxi"})
+ public String dataset;
+
+ @Param({"none", "snappy", "zlib"})
+ public String compression;
+
+ @AuxCounters
+ @State(Scope.Thread)
+ public static class ExtraCounters {
+ long bytesRead;
+ long reads;
+ long records;
+ long invocations;
+
+ @Setup(Level.Iteration)
+ public void clean() {
+ bytesRead = 0;
+ reads = 0;
+ records = 0;
+ invocations = 0;
+ }
+
+ @TearDown(Level.Iteration)
+ public void print() {
+ System.out.println();
+ System.out.println("Reads: " + reads);
+ System.out.println("Bytes: " + bytesRead);
+ System.out.println("Records: " + records);
+ System.out.println("Invocations: " + invocations);
+ }
+
+ public long kilobytes() {
+ return bytesRead / 1024;
+ }
+
+ public long records() {
+ return records;
+ }
+ }
+
+ @Benchmark
+ public void orc(ExtraCounters counters) throws Exception {
+ Configuration conf = new Configuration();
+ TrackingLocalFileSystem fs = new TrackingLocalFileSystem();
+ fs.initialize(new URI("file:///"), conf);
+ FileSystem.Statistics statistics = fs.getLocalStatistics();
+ statistics.reset();
+ OrcFile.ReaderOptions options = OrcFile.readerOptions(conf).filesystem(fs);
+ Path path = Utilities.getVariant(root, dataset, "orc", compression);
+ Reader reader = OrcFile.createReader(path, options);
+ TypeDescription schema = reader.getSchema();
+ boolean[] include = new boolean[schema.getMaximumId() + 1];
+ // select first two columns
+ List<TypeDescription> children = schema.getChildren();
+ for(int c= children.get(0).getId(); c <= children.get(1).getMaximumId(); ++c) {
+ include[c] = true;
+ }
+ RecordReader rows = reader.rows(new Reader.Options()
+ .include(include));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ while (rows.nextBatch(batch)) {
+ counters.records += batch.size;
+ }
+ rows.close();
+ counters.bytesRead += statistics.getBytesRead();
+ counters.reads += statistics.getReadOps();
+ counters.invocations += 1;
+ }
+
+ @Benchmark
+ public void parquet(ExtraCounters counters) throws Exception {
+ JobConf conf = new JobConf();
+ conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
+ conf.set("fs.defaultFS", "track:///");
+ if ("taxi".equals(dataset)) {
+ conf.set("columns", "vendor_id,pickup_time");
+ conf.set("columns.types", "int,timestamp");
+ } else if ("sales".equals(dataset)) {
+ conf.set("columns", "sales_id,customer_id");
+ conf.set("columns.types", "bigint,bigint");
+ } else if ("github".equals(dataset)) {
+ conf.set("columns", "actor,created_at");
+ conf.set("columns.types", "struct<avatar_url:string,gravatar_id:string," +
+ "id:int,login:string,url:string>,timestamp");
+ } else {
+ throw new IllegalArgumentException("Unknown data set " + dataset);
+ }
+ Path path = Utilities.getVariant(root, dataset, "parquet", compression);
+ FileSystem.Statistics statistics = FileSystem.getStatistics("track:///",
+ TrackingLocalFileSystem.class);
+ statistics.reset();
+ ParquetInputFormat<ArrayWritable> inputFormat =
+ new ParquetInputFormat<>(DataWritableReadSupport.class);
+
+ NullWritable nada = NullWritable.get();
+ FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[]{});
+ org.apache.hadoop.mapred.RecordReader<NullWritable,ArrayWritable> recordReader =
+ new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL);
+ ArrayWritable value = recordReader.createValue();
+ while (recordReader.next(nada, value)) {
+ counters.records += 1;
+ }
+ recordReader.close();
+ counters.bytesRead += statistics.getBytesRead();
+ counters.reads += statistics.getReadOps();
+ counters.invocations += 1;
+ }
+ public static void main(String[] args) throws Exception {
+ new Runner(new OptionsBuilder()
+ .include(ColumnProjectionBenchmark.class.getSimpleName())
+ .jvmArgs("-server", "-Xms256m", "-Xmx2g",
+ "-D" + ROOT_ENVIRONMENT_NAME + "=" + args[0]).build()
+ ).run();
+ }
+}
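The orc() benchmark above selects the first two columns by walking their column-id subtrees. As an illustrative sketch (not part of this commit), the same technique generalizes to a named top-level field:

import java.util.List;
import org.apache.orc.TypeDescription;

public class ProjectionSketch {
  // Build an ORC include[] that covers one named top-level field
  // and all of its nested children.
  static boolean[] includeField(TypeDescription schema, String field) {
    boolean[] include = new boolean[schema.getMaximumId() + 1];
    List<String> names = schema.getFieldNames();
    List<TypeDescription> children = schema.getChildren();
    TypeDescription child = children.get(names.indexOf(field));
    for (int id = child.getId(); id <= child.getMaximumId(); ++id) {
      include[id] = true;
    }
    return include;
  }
}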
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/CompressionKind.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/CompressionKind.java b/java/bench/src/java/org/apache/orc/bench/CompressionKind.java
new file mode 100644
index 0000000..9274de3
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/CompressionKind.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import io.airlift.compress.snappy.SnappyCodec;
+import org.apache.hadoop.fs.Path;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+/**
+ * Enum for handling the compression codecs for the benchmark
+ */
+public enum CompressionKind {
+ NONE(".none"),
+ ZLIB(".gz"),
+ SNAPPY(".snappy");
+
+ CompressionKind(String extension) {
+ this.extension = extension;
+ }
+
+ private final String extension;
+
+ public String getExtension() {
+ return extension;
+ }
+
+ public OutputStream create(OutputStream out) throws IOException {
+ switch (this) {
+ case NONE:
+ return out;
+ case ZLIB:
+ return new GZIPOutputStream(out);
+ case SNAPPY:
+ return new SnappyCodec().createOutputStream(out);
+ default:
+ throw new IllegalArgumentException("Unhandled kind " + this);
+ }
+ }
+
+ public InputStream read(InputStream in) throws IOException {
+ switch (this) {
+ case NONE:
+ return in;
+ case ZLIB:
+ return new GZIPInputStream(in);
+ case SNAPPY:
+ return new SnappyCodec().createInputStream(in);
+ default:
+ throw new IllegalArgumentException("Unhandled kind " + this);
+ }
+ }
+
+ public static CompressionKind fromPath(Path path) {
+ String name = path.getName();
+ int lastDot = name.lastIndexOf('.');
+ if (lastDot >= 0) {
+ String ext = name.substring(lastDot);
+ for (CompressionKind value : values()) {
+ if (ext.equals(value.getExtension())) {
+ return value;
+ }
+ }
+ }
+ return NONE;
+ }
+}
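As an illustrative round trip (not part of this commit) using the enum above; the file name /tmp/sample.json.gz is hypothetical and resolves to ZLIB via fromPath:

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.Path;
import org.apache.orc.bench.CompressionKind;

public class CompressionSketch {
  public static void main(String[] args) throws Exception {
    Path path = new Path("/tmp/sample.json.gz");           // hypothetical file
    CompressionKind kind = CompressionKind.fromPath(path); // ".gz" -> ZLIB
    String local = path.toUri().getPath();
    try (OutputStream out = kind.create(new FileOutputStream(local))) {
      out.write("hello benchmark".getBytes(StandardCharsets.UTF_8));
    }
    try (InputStream in = kind.read(new FileInputStream(local))) {
      byte[] buf = new byte[64];
      int n = in.read(buf);
      System.out.println(new String(buf, 0, n, StandardCharsets.UTF_8));
    }
  }
}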
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/Driver.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/Driver.java b/java/bench/src/java/org/apache/orc/bench/Driver.java
new file mode 100644
index 0000000..c8f1592
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/Driver.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.orc.bench.convert.GenerateVariants;
+import org.apache.orc.bench.convert.ScanVariants;
+
+import java.util.Arrays;
+
+/**
+ * A driver tool to call the various benchmark classes.
+ */
+public class Driver {
+
+ static CommandLine parseCommandLine(String[] args) throws ParseException {
+ Options options = new Options()
+ .addOption("h", "help", false, "Provide help")
+ .addOption("D", "define", true, "Change configuration settings");
+ CommandLine result = new DefaultParser().parse(options, args, true);
+ if (result.hasOption("help") || result.getArgs().length == 0) {
+ new HelpFormatter().printHelp("benchmark <command>", options);
+ System.err.println();
+ System.err.println("Commands:");
+ System.err.println(" generate - Generate data variants");
+ System.err.println(" scan - Scan data variants");
+ System.err.println(" read-all - Full table scan benchmark");
+ System.err.println(" read-some - Column projection benchmark");
+ System.exit(1);
+ }
+ return result;
+ }
+
+ public static void main(String[] args) throws Exception {
+ CommandLine cli = parseCommandLine(args);
+ args = cli.getArgs();
+ String command = args[0];
+ args = Arrays.copyOfRange(args, 1, args.length);
+ switch (command) {
+ case "generate":
+ GenerateVariants.main(args);
+ break;
+ case "scan":
+ ScanVariants.main(args);
+ break;
+ case "read-all":
+ FullReadBenchmark.main(args);
+ break;
+ case "read-some":
+ ColumnProjectionBenchmark.main(args);
+ break;
+ default:
+ System.err.println("Unknown command " + command);
+ System.exit(1);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/FullReadBenchmark.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/FullReadBenchmark.java b/java/bench/src/java/org/apache/orc/bench/FullReadBenchmark.java
new file mode 100644
index 0000000..952f18d
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/FullReadBenchmark.java
@@ -0,0 +1,223 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import com.google.gson.JsonStreamParser;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.generic.GenericDatumReader;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.mapred.FsInput;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.TrackingLocalFileSystem;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
+import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+import org.apache.parquet.hadoop.ParquetInputFormat;
+import org.openjdk.jmh.annotations.AuxCounters;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.AverageTime)
+@Warmup(iterations=1, time=10, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations=3, time=10, timeUnit = TimeUnit.SECONDS)
+@State(Scope.Thread)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@Fork(1)
+public class FullReadBenchmark {
+
+ private static final String ROOT_ENVIRONMENT_NAME = "bench.root.dir";
+ private static final Path root;
+ static {
+ String value = System.getProperty(ROOT_ENVIRONMENT_NAME);
+ root = value == null ? null : new Path(value);
+ }
+
+ @Param({"taxi", "sales", "github"})
+ public String dataset;
+
+ @Param({"none", "zlib", "snappy"})
+ public String compression;
+
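+ // JMH reports the public numeric methods of an @AuxCounters state object
+ // (kilobytes() and records() below) as secondary per-iteration metrics
+ // alongside the primary timing result.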
+ @AuxCounters
+ @State(Scope.Thread)
+ public static class ExtraCounters {
+ long bytesRead;
+ long reads;
+ long records;
+ long invocations;
+
+ @Setup(Level.Iteration)
+ public void clean() {
+ bytesRead = 0;
+ reads = 0;
+ records = 0;
+ invocations = 0;
+ }
+
+ @TearDown(Level.Iteration)
+ public void print() {
+ System.out.println();
+ System.out.println("Reads: " + reads);
+ System.out.println("Bytes: " + bytesRead);
+ System.out.println("Records: " + records);
+ System.out.println("Invocations: " + invocations);
+ }
+
+ public long kilobytes() {
+ return bytesRead / 1024;
+ }
+
+ public long records() {
+ return records;
+ }
+ }
+
+ @Benchmark
+ public void orc(ExtraCounters counters) throws Exception {
+ Configuration conf = new Configuration();
+ TrackingLocalFileSystem fs = new TrackingLocalFileSystem();
+ fs.initialize(new URI("file:///"), conf);
+ FileSystem.Statistics statistics = fs.getLocalStatistics();
+ statistics.reset();
+ OrcFile.ReaderOptions options = OrcFile.readerOptions(conf).filesystem(fs);
+ Path path = Utilities.getVariant(root, dataset, "orc", compression);
+ Reader reader = OrcFile.createReader(path, options);
+ TypeDescription schema = reader.getSchema();
+ RecordReader rows = reader.rows();
+ VectorizedRowBatch batch = schema.createRowBatch();
+ while (rows.nextBatch(batch)) {
+ counters.records += batch.size;
+ }
+ rows.close();
+ counters.bytesRead += statistics.getBytesRead();
+ counters.reads += statistics.getReadOps();
+ counters.invocations += 1;
+ }
+
+ @Benchmark
+ public void avro(ExtraCounters counters) throws Exception {
+ Configuration conf = new Configuration();
+ conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
+ conf.set("fs.defaultFS", "track:///");
+ Path path = Utilities.getVariant(root, dataset, "avro", compression);
+ FileSystem.Statistics statistics = FileSystem.getStatistics("track:///",
+ TrackingLocalFileSystem.class);
+ statistics.reset();
+ FsInput file = new FsInput(path, conf);
+ DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
+ DataFileReader<GenericRecord> dataFileReader =
+ new DataFileReader<>(file, datumReader);
+ GenericRecord record = null;
+ while (dataFileReader.hasNext()) {
+ record = dataFileReader.next(record);
+ counters.records += 1;
+ }
+ counters.bytesRead += statistics.getBytesRead();
+ counters.reads += statistics.getReadOps();
+ counters.invocations += 1;
+ }
+
+ @Benchmark
+ public void parquet(ExtraCounters counters) throws Exception {
+ JobConf conf = new JobConf();
+ conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
+ conf.set("fs.defaultFS", "track:///");
+ Path path = Utilities.getVariant(root, dataset, "parquet", compression);
+ FileSystem.Statistics statistics = FileSystem.getStatistics("track:///",
+ TrackingLocalFileSystem.class);
+ statistics.reset();
+ ParquetInputFormat<ArrayWritable> inputFormat =
+ new ParquetInputFormat<>(DataWritableReadSupport.class);
+
+ NullWritable nada = NullWritable.get();
+ FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[]{});
+ org.apache.hadoop.mapred.RecordReader<NullWritable,ArrayWritable> recordReader =
+ new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL);
+ ArrayWritable value = recordReader.createValue();
+ while (recordReader.next(nada, value)) {
+ counters.records += 1;
+ }
+ recordReader.close();
+ counters.bytesRead += statistics.getBytesRead();
+ counters.reads += statistics.getReadOps();
+ counters.invocations += 1;
+ }
+
+ @Benchmark
+ public void json(ExtraCounters counters) throws Exception {
+ Configuration conf = new Configuration();
+ TrackingLocalFileSystem fs = new TrackingLocalFileSystem();
+ fs.initialize(new URI("file:///"), conf);
+ FileSystem.Statistics statistics = fs.getLocalStatistics();
+ statistics.reset();
+ Path path = Utilities.getVariant(root, dataset, "json", compression);
+ // match the upper-case enum constants, as GenerateVariants does
+ CompressionKind compress = CompressionKind.valueOf(compression.toUpperCase());
+ InputStream input = compress.read(fs.open(path));
+ JsonStreamParser parser =
+ new JsonStreamParser(new InputStreamReader(input,
+ StandardCharsets.UTF_8));
+ while (parser.hasNext()) {
+ parser.next();
+ counters.records += 1;
+ }
+ counters.bytesRead += statistics.getBytesRead();
+ counters.reads += statistics.getReadOps();
+ counters.invocations += 1;
+ }
+
+ public static void main(String[] args) throws Exception {
+ new Runner(new OptionsBuilder()
+ .include(FullReadBenchmark.class.getSimpleName())
+ .addProfiler("hs_gc")
+ .jvmArgs("-server", "-Xms256m", "-Xmx2g",
+ "-D" + ROOT_ENVIRONMENT_NAME + "=" + args[0]).build()
+ ).run();
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/RandomGenerator.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/RandomGenerator.java b/java/bench/src/java/org/apache/orc/bench/RandomGenerator.java
new file mode 100644
index 0000000..dfe7d43
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/RandomGenerator.java
@@ -0,0 +1,524 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+
+import java.nio.charset.StandardCharsets;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+public class RandomGenerator {
+ private final TypeDescription schema = TypeDescription.createStruct();
+ private final List<Field> fields = new ArrayList<>();
+ private final Random random;
+
+ public RandomGenerator(int seed) {
+ random = new Random(seed);
+ }
+
+ private abstract class ValueGenerator {
+ double nullProbability = 0;
+ abstract void generate(ColumnVector vector, int valueCount);
+ }
+
+ private class RandomBoolean extends ValueGenerator {
+ public void generate(ColumnVector v, int valueCount) {
+ LongColumnVector vector = (LongColumnVector) v;
+ for(int r=0; r < valueCount; ++r) {
+ if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+ v.noNulls = false;
+ v.isNull[r] = true;
+ } else {
+ vector.vector[r] = random.nextInt(2);
+ }
+ }
+ }
+ }
+
+ private class RandomList extends ValueGenerator {
+ private final int minSize;
+ private final int sizeRange;
+ private final Field child;
+
+ public RandomList(int minSize, int maxSize, Field child) {
+ this.minSize = minSize;
+ this.sizeRange = maxSize - minSize + 1;
+ this.child = child;
+ }
+
+ public void generate(ColumnVector v, int valueCount) {
+ ListColumnVector vector = (ListColumnVector) v;
+ for(int r=0; r < valueCount; ++r) {
+ if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+ v.noNulls = false;
+ v.isNull[r] = true;
+ } else {
+ vector.offsets[r] = vector.childCount;
+ vector.lengths[r] = random.nextInt(sizeRange) + minSize;
+ vector.childCount += vector.lengths[r];
+ }
+ }
+ vector.child.ensureSize(vector.childCount, false);
+ child.generator.generate(vector.child, vector.childCount);
+ }
+ }
+
+ private class RandomStruct extends ValueGenerator {
+ private final Field[] children;
+
+ public RandomStruct(Field[] children) {
+ this.children = children;
+ }
+
+ public void generate(ColumnVector v, int valueCount) {
+ StructColumnVector vector = (StructColumnVector) v;
+ for(int r=0; r < valueCount; ++r) {
+ if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+ v.noNulls = false;
+ v.isNull[r] = true;
+ }
+ }
+ for(int c=0; c < children.length; ++c) {
+ children[c].generator.generate(vector.fields[c], valueCount);
+ }
+ }
+ }
+
+ private abstract class IntegerGenerator extends ValueGenerator {
+ private final long sign;
+ private final long mask;
+
+ private IntegerGenerator(TypeDescription.Category kind) {
+ int bits = getIntegerLength(kind);
+ mask = bits == 64 ? 0 : -1L << bits;
+ sign = 1L << (bits - 1);
+ }
+
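+ // For example, for INT (32 bits): mask = 0xFFFFFFFF00000000L and
+ // sign = 1L << 31, so normalize() either clears or fills the masked
+ // high bits, sign extending each random long into the int range.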
+ protected void normalize(LongColumnVector vector, int valueCount) {
+ // make sure the value stays in range by sign extending it
+ for(int r=0; r < valueCount; ++r) {
+ if ((vector.vector[r] & sign) == 0) {
+ vector.vector[r] &= ~mask;
+ } else {
+ vector.vector[r] |= mask;
+ }
+ }
+ }
+ }
+
+ private class AutoIncrement extends IntegerGenerator {
+ private long value;
+ private final long increment;
+
+ private AutoIncrement(TypeDescription.Category kind, long start,
+ long increment) {
+ super(kind);
+ this.value = start;
+ this.increment = increment;
+ }
+
+ public void generate(ColumnVector v, int valueCount) {
+ LongColumnVector vector = (LongColumnVector) v;
+ for(int r=0; r < valueCount; ++r) {
+ if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+ v.noNulls = false;
+ v.isNull[r] = true;
+ } else {
+ vector.vector[r] = value;
+ value += increment;
+ }
+ }
+ normalize(vector, valueCount);
+ }
+ }
+
+ private class RandomInteger extends IntegerGenerator {
+
+ private RandomInteger(TypeDescription.Category kind) {
+ super(kind);
+ }
+
+ public void generate(ColumnVector v, int valueCount) {
+ LongColumnVector vector = (LongColumnVector) v;
+ for(int r=0; r < valueCount; ++r) {
+ if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+ v.noNulls = false;
+ v.isNull[r] = true;
+ } else {
+ vector.vector[r] = random.nextLong();
+ }
+ }
+ normalize(vector, valueCount);
+ }
+ }
+
+ private class IntegerRange extends IntegerGenerator {
+ private final long minimum;
+ private final long range;
+ private final long limit;
+
+ private IntegerRange(TypeDescription.Category kind, long minimum,
+ long maximum) {
+ super(kind);
+ this.minimum = minimum;
+ this.range = maximum - minimum + 1;
+ if (this.range < 0) {
+ throw new IllegalArgumentException("Can't support a negative range "
+ + range);
+ }
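+ // limit is the largest multiple of range that fits in a non-negative
+ // long; generate() rejects draws at or above it so that (rand % range)
+ // remains uniform rather than biased toward small values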
+ limit = (Long.MAX_VALUE / range) * range;
+ }
+
+ public void generate(ColumnVector v, int valueCount) {
+ LongColumnVector vector = (LongColumnVector) v;
+ for(int r=0; r < valueCount; ++r) {
+ if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+ v.noNulls = false;
+ v.isNull[r] = true;
+ } else {
+ long rand;
+ do {
+ // clear the sign bit
+ rand = random.nextLong() & Long.MAX_VALUE;
+ } while (rand >= limit);
+ vector.vector[r] = (rand % range) + minimum;
+ }
+ }
+ normalize(vector, valueCount);
+ }
+ }
+
+ private class StringChooser extends ValueGenerator {
+ private final byte[][] choices;
+ private StringChooser(String[] values) {
+ choices = new byte[values.length][];
+ for(int e=0; e < values.length; ++e) {
+ choices[e] = values[e].getBytes(StandardCharsets.UTF_8);
+ }
+ }
+
+ public void generate(ColumnVector v, int valueCount) {
+ BytesColumnVector vector = (BytesColumnVector) v;
+ for(int r=0; r < valueCount; ++r) {
+ if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+ v.noNulls = false;
+ v.isNull[r] = true;
+ } else {
+ int val = random.nextInt(choices.length);
+ vector.setRef(r, choices[val], 0, choices[val].length);
+ }
+ }
+ }
+ }
+
+ private static byte[] concat(byte[] left, byte[] right) {
+ byte[] result = new byte[left.length + right.length];
+ System.arraycopy(left, 0, result, 0, left.length);
+ System.arraycopy(right, 0, result, left.length, right.length);
+ return result;
+ }
+
+ private static byte pickOne(byte[] choices, Random random) {
+ return choices[random.nextInt(choices.length)];
+ }
+
+ private static final byte[] LOWER_CONSONANTS =
+ "bcdfghjklmnpqrstvwxyz".getBytes(StandardCharsets.UTF_8);
+ private static final byte[] UPPER_CONSONANTS =
+ "BCDFGHJKLMNPQRSTVWXYZ".getBytes(StandardCharsets.UTF_8);
+ private static final byte[] CONSONANTS =
+ concat(LOWER_CONSONANTS, UPPER_CONSONANTS);
+ private static final byte[] LOWER_VOWELS = "aeiou".getBytes(StandardCharsets.UTF_8);
+ private static final byte[] UPPER_VOWELS = "AEIOU".getBytes(StandardCharsets.UTF_8);
+ private static final byte[] VOWELS = concat(LOWER_VOWELS, UPPER_VOWELS);
+ private static final byte[] LOWER_LETTERS =
+ concat(LOWER_CONSONANTS, LOWER_VOWELS);
+ private static final byte[] UPPER_LETTERS =
+ concat(UPPER_CONSONANTS, UPPER_VOWELS);
+ private static final byte[] LETTERS = concat(LOWER_LETTERS, UPPER_LETTERS);
+ private static final byte[] NATURAL_DIGITS = "123456789".getBytes(StandardCharsets.UTF_8);
+ private static final byte[] DIGITS = "0123456789".getBytes(StandardCharsets.UTF_8);
+
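+ /**
+ * Generates strings from a template: 'C'/'c' pick an upper/lower case
+ * consonant, 'E' any consonant, 'V'/'v' an upper/lower case vowel,
+ * 'F' any vowel, 'L'/'l' an upper/lower case letter, 'D' any letter,
+ * 'x' a digit 1-9, and 'X' a digit 0-9; other characters are literal.
+ */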
+ private class StringPattern extends ValueGenerator {
+ private final byte[] buffer;
+ private final byte[][] choices;
+ private final int[] locations;
+
+ private StringPattern(String pattern) {
+ buffer = pattern.getBytes(StandardCharsets.UTF_8);
+ int locs = 0;
+ for(int i=0; i < buffer.length; ++i) {
+ switch (buffer[i]) {
+ case 'C':
+ case 'c':
+ case 'E':
+ case 'V':
+ case 'v':
+ case 'F':
+ case 'l':
+ case 'L':
+ case 'D':
+ case 'x':
+ case 'X':
+ locs += 1;
+ break;
+ default:
+ break;
+ }
+ }
+ locations = new int[locs];
+ choices = new byte[locs][];
+ locs = 0;
+ for(int i=0; i < buffer.length; ++i) {
+ switch (buffer[i]) {
+ case 'C':
+ locations[locs] = i;
+ choices[locs++] = UPPER_CONSONANTS;
+ break;
+ case 'c':
+ locations[locs] = i;
+ choices[locs++] = LOWER_CONSONANTS;
+ break;
+ case 'E':
+ locations[locs] = i;
+ choices[locs++] = CONSONANTS;
+ break;
+ case 'V':
+ locations[locs] = i;
+ choices[locs++] = UPPER_VOWELS;
+ break;
+ case 'v':
+ locations[locs] = i;
+ choices[locs++] = LOWER_VOWELS;
+ break;
+ case 'F':
+ locations[locs] = i;
+ choices[locs++] = VOWELS;
+ break;
+ case 'l':
+ locations[locs] = i;
+ choices[locs++] = LOWER_LETTERS;
+ break;
+ case 'L':
+ locations[locs] = i;
+ choices[locs++] = UPPER_LETTERS;
+ break;
+ case 'D':
+ locations[locs] = i;
+ choices[locs++] = LETTERS;
+ break;
+ case 'x':
+ locations[locs] = i;
+ choices[locs++] = NATURAL_DIGITS;
+ break;
+ case 'X':
+ locations[locs] = i;
+ choices[locs++] = DIGITS;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ public void generate(ColumnVector v, int valueCount) {
+ BytesColumnVector vector = (BytesColumnVector) v;
+ for(int r=0; r < valueCount; ++r) {
+ if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+ v.noNulls = false;
+ v.isNull[r] = true;
+ } else {
+ for(int m=0; m < locations.length; ++m) {
+ buffer[locations[m]] = pickOne(choices[m], random);
+ }
+ vector.setVal(r, buffer, 0, buffer.length);
+ }
+ }
+ }
+ }
+
+ private class TimestampRange extends ValueGenerator {
+ private final long minimum;
+ private final long range;
+ private final long limit;
+
+ private TimestampRange(String min, String max) {
+ minimum = Timestamp.valueOf(min).getTime();
+ range = Timestamp.valueOf(max).getTime() - minimum + 1;
+ if (range < 0) {
+ throw new IllegalArgumentException("Negative range " + range);
+ }
+ limit = (Long.MAX_VALUE / range) * range;
+ }
+
+ public void generate(ColumnVector v, int valueCount) {
+ TimestampColumnVector vector = (TimestampColumnVector) v;
+ for(int r=0; r < valueCount; ++r) {
+ if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+ v.noNulls = false;
+ v.isNull[r] = true;
+ } else {
+ long rand;
+ do {
+ // clear the sign bit
+ rand = random.nextLong() & Long.MAX_VALUE;
+ } while (rand >= limit);
+ vector.time[r] = (rand % range) + minimum;
+ vector.nanos[r] = random.nextInt(1000000);
+ }
+ }
+ }
+ }
+
+ private static int getIntegerLength(TypeDescription.Category kind) {
+ switch (kind) {
+ case BYTE:
+ return 8;
+ case SHORT:
+ return 16;
+ case INT:
+ return 32;
+ case LONG:
+ return 64;
+ default:
+ throw new IllegalArgumentException("Unhandled type " + kind);
+ }
+ }
+
+ public class Field {
+ private final TypeDescription type;
+ private Field[] children;
+ private ValueGenerator generator;
+
+ private Field(TypeDescription type) {
+ this.type = type;
+ if (!type.getCategory().isPrimitive()) {
+ List<TypeDescription> childrenTypes = type.getChildren();
+ children = new Field[childrenTypes.size()];
+ for(int c=0; c < children.length; ++c) {
+ children[c] = new Field(childrenTypes.get(c));
+ }
+ }
+ }
+
+ public Field addAutoIncrement(long start, long increment) {
+ generator = new AutoIncrement(type.getCategory(), start, increment);
+ return this;
+ }
+
+ public Field addIntegerRange(long min, long max) {
+ generator = new IntegerRange(type.getCategory(), min, max);
+ return this;
+ }
+
+ public Field addRandomInt() {
+ generator = new RandomInteger(type.getCategory());
+ return this;
+ }
+
+ public Field addStringChoice(String... choices) {
+ if (type.getCategory() != TypeDescription.Category.STRING) {
+ throw new IllegalArgumentException("Must be string - " + type);
+ }
+ generator = new StringChooser(choices);
+ return this;
+ }
+
+ public Field addStringPattern(String pattern) {
+ if (type.getCategory() != TypeDescription.Category.STRING) {
+ throw new IllegalArgumentException("Must be string - " + type);
+ }
+ generator = new StringPattern(pattern);
+ return this;
+ }
+
+ public Field addTimestampRange(String start, String end) {
+ if (type.getCategory() != TypeDescription.Category.TIMESTAMP) {
+ throw new IllegalArgumentException("Must be timestamp - " + type);
+ }
+ generator = new TimestampRange(start, end);
+ return this;
+ }
+
+ public Field addBoolean() {
+ if (type.getCategory() != TypeDescription.Category.BOOLEAN) {
+ throw new IllegalArgumentException("Must be boolean - " + type);
+ }
+ generator = new RandomBoolean();
+ return this;
+ }
+
+ public Field hasNulls(double probability) {
+ generator.nullProbability = probability;
+ return this;
+ }
+
+ public Field addStruct() {
+ generator = new RandomStruct(children);
+ return this;
+ }
+
+ public Field addList(int minSize, int maxSize) {
+ generator = new RandomList(minSize, maxSize, children[0]);
+ return this;
+ }
+
+ public Field getChildField(int child) {
+ return children[child];
+ }
+ }
+
+ public Field addField(String name, TypeDescription.Category kind) {
+ TypeDescription type = new TypeDescription(kind);
+ return addField(name, type);
+ }
+
+ public Field addField(String name, TypeDescription type) {
+ schema.addField(name, type);
+ Field result = new Field(type);
+ fields.add(result);
+ return result;
+ }
+
+ public void generate(VectorizedRowBatch batch, int rowCount) {
+ batch.reset();
+ for(int c=0; c < batch.cols.length; ++c) {
+ fields.get(c).generator.generate(batch.cols[c], rowCount);
+ }
+ batch.size = rowCount;
+ }
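+
+ // A minimal usage sketch (mirrors how SalesGenerator drives this class):
+ //   RandomGenerator gen = new RandomGenerator(42);
+ //   gen.addField("id", TypeDescription.Category.LONG).addAutoIncrement(1, 1);
+ //   gen.addField("city", TypeDescription.Category.STRING)
+ //      .addStringChoice("Seattle", "Boston").hasNulls(0.05);
+ //   VectorizedRowBatch batch = gen.getSchema().createRowBatch();
+ //   gen.generate(batch, batch.getMaxSize());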
+
+ /**
+ * Get the schema for the table that is being generated.
+ * @return the root struct type describing the generated rows
+ */
+ public TypeDescription getSchema() {
+ return schema;
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/SalesGenerator.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/SalesGenerator.java b/java/bench/src/java/org/apache/orc/bench/SalesGenerator.java
new file mode 100644
index 0000000..2be3537
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/SalesGenerator.java
@@ -0,0 +1,206 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.convert.BatchReader;
+
+public class SalesGenerator implements BatchReader {
+ private final RandomGenerator generator;
+ private long rowsRemaining;
+ private static final double MOSTLY = 0.99999; // columns that are almost always null
+
+ public SalesGenerator(long rows) {
+ this(rows, 42);
+ }
+
+ public SalesGenerator(long rows, int seed) {
+ generator = new RandomGenerator(seed);
+ // column 1
+ generator.addField("sales_id", TypeDescription.Category.LONG)
+ .addAutoIncrement(1000000000, 1);
+ generator.addField("customer_id", TypeDescription.Category.LONG)
+ .addIntegerRange(1000000000, 2000000000);
+ generator.addField("col3", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 10000).hasNulls(0.9993100389335173);
+
+ // column 4
+ generator.addField("item_category", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 1000000).hasNulls(0.00014784879996054823);
+ generator.addField("item_count", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 1000);
+ generator.addField("change_ts", TypeDescription.Category.TIMESTAMP)
+ .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
+
+ // column 7
+ generator.addField("store_location", TypeDescription.Category.STRING)
+ .addStringChoice("Los Angeles", "New York", "Cupertino", "Sunnyvale",
+ "Boston", "Chicago", "Seattle", "Jackson",
+ "Palo Alto", "San Mateo", "San Jose", "Santa Clara",
+ "Irvine", "Torrance", "Gardena", "Hermosa", "Manhattan")
+ .hasNulls(0.0004928293332019384);
+ generator.addField("associate_id", TypeDescription.Category.STRING)
+ .addStringPattern("MR V").hasNulls(0.05026859198659506);
+ generator.addField("col9", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 1000000000).hasNulls(MOSTLY);
+
+ // column 10
+ generator.addField("rebate_id", TypeDescription.Category.STRING)
+ .addStringPattern("xxxxxx").hasNulls(MOSTLY);
+ generator.addField("create_ts", TypeDescription.Category.TIMESTAMP)
+ .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
+ generator.addField("col13", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 100000).hasNulls(MOSTLY);
+
+ // column 13
+ generator.addField("size", TypeDescription.Category.STRING)
+ .addStringChoice("Small", "Medium", "Large", "XL")
+ .hasNulls(0.9503720861465674);
+ generator.addField("col14", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 100000);
+ generator.addField("fulfilled", TypeDescription.Category.BOOLEAN)
+ .addBoolean();
+
+ // column 16
+ generator.addField("global_id", TypeDescription.Category.STRING)
+ .addStringPattern("xxxxxxxxxxxxxxxx").hasNulls(0.021388793060962974);
+ generator.addField("col17", TypeDescription.Category.STRING)
+ .addStringPattern("L-xx").hasNulls(MOSTLY);
+ generator.addField("col18", TypeDescription.Category.STRING)
+ .addStringPattern("ll").hasNulls(MOSTLY);
+
+ // column 19
+ generator.addField("col19", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 100000);
+ generator.addField("has_rebate", TypeDescription.Category.BOOLEAN)
+ .addBoolean();
+ RandomGenerator.Field list =
+ generator.addField("col21",
+ TypeDescription.fromString("array<struct<sub1:bigint,sub2:string," +
+ "sub3:string,sub4:bigint,sub5:bigint,sub6:string>>"))
+ .addList(0, 3)
+ .hasNulls(MOSTLY);
+ RandomGenerator.Field struct = list.getChildField(0).addStruct();
+ struct.getChildField(0).addIntegerRange(0, 10000000);
+ struct.getChildField(1).addStringPattern("VVVVV");
+ struct.getChildField(2).addStringPattern("VVVVVVVV");
+ struct.getChildField(3).addIntegerRange(0, 10000000);
+ struct.getChildField(4).addIntegerRange(0, 10000000);
+ struct.getChildField(5).addStringPattern("VVVVVVVV");
+
+ // column 38
+ generator.addField("vendor_id", TypeDescription.Category.STRING)
+ .addStringPattern("Lxxxxxx").hasNulls(0.1870780148834459);
+ generator.addField("country", TypeDescription.Category.STRING)
+ .addStringChoice("USA", "Germany", "Ireland", "Canada", "Mexico",
+ "Denmark").hasNulls(0.0004928293332019384);
+
+ // column 40
+ generator.addField("backend_version", TypeDescription.Category.STRING)
+ .addStringPattern("X.xx").hasNulls(0.0005913951998423039);
+ generator.addField("col41", TypeDescription.Category.LONG)
+ .addIntegerRange(1000000000, 100000000000L);
+ generator.addField("col42", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 1000000000);
+
+ // column 43
+ generator.addField("col43", TypeDescription.Category.LONG)
+ .addIntegerRange(1000000000, 10000000000L).hasNulls(0.9763934749396284);
+ generator.addField("col44", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 100000000);
+ generator.addField("col45", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 100000000);
+
+ // column 46
+ generator.addField("col46", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 10000000);
+ generator.addField("col47", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 1000);
+ generator.addField("col48", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 1000000).hasNulls(MOSTLY);
+
+ // column 49
+ generator.addField("col49", TypeDescription.Category.STRING)
+ .addStringPattern("xxxx").hasNulls(0.0004928293332019384);
+ generator.addField("col50", TypeDescription.Category.STRING)
+ .addStringPattern("ll").hasNulls(0.9496821250800848);
+ generator.addField("col51", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 1000000).hasNulls(0.9999014341333596);
+
+ // column 52
+ generator.addField("col52", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 1000000).hasNulls(0.9980779656005125);
+ generator.addField("col53", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 1000000000);
+ generator.addField("col54", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 1000000000);
+
+ // column 55
+ generator.addField("col55", TypeDescription.Category.STRING)
+ .addStringChoice("X");
+ generator.addField("col56", TypeDescription.Category.TIMESTAMP)
+ .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
+ generator.addField("col57", TypeDescription.Category.TIMESTAMP)
+ .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
+
+ // column 58
+ generator.addField("md5", TypeDescription.Category.LONG)
+ .addRandomInt();
+ generator.addField("col59", TypeDescription.Category.LONG)
+ .addIntegerRange(1000000000, 10000000000L);
+ generator.addField("col69", TypeDescription.Category.TIMESTAMP)
+ .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59")
+ .hasNulls(MOSTLY);
+
+ // column 61
+ generator.addField("col61", TypeDescription.Category.STRING)
+ .addStringPattern("X.xx").hasNulls(0.11399142476960233);
+ generator.addField("col62", TypeDescription.Category.STRING)
+ .addStringPattern("X.xx").hasNulls(0.9986200778670347);
+ generator.addField("col63", TypeDescription.Category.TIMESTAMP)
+ .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
+
+ // column 64
+ generator.addField("col64", TypeDescription.Category.LONG)
+ .addIntegerRange(1, 1000000).hasNulls(MOSTLY);
+ rowsRemaining = rows;
+ }
+
+ public boolean nextBatch(VectorizedRowBatch batch) {
+ int rows = (int) Math.min(batch.getMaxSize(), rowsRemaining);
+ generator.generate(batch, rows);
+ rowsRemaining -= rows;
+ return rows != 0;
+ }
+
+ @Override
+ public void close() {
+ // PASS
+ }
+
+ public TypeDescription getSchema() {
+ return generator.getSchema();
+ }
+
+ public static void main(String[] args) throws Exception {
+ SalesGenerator sales = new SalesGenerator(10, 42);
+ System.out.println("Schema " + sales.getSchema());
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/Utilities.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/Utilities.java b/java/bench/src/java/org/apache/orc/bench/Utilities.java
new file mode 100644
index 0000000..7016f5e
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/Utilities.java
@@ -0,0 +1,127 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.orc.TypeDescription;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+import java.util.Properties;
+
+public class Utilities {
+
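+ // loadSchema reads a schema resource bundled with the benchmarks, e.g.
+ // loadSchema("sales.schema"), as GenerateVariants does for each data set.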
+ public static TypeDescription loadSchema(String name) throws IOException {
+ InputStream in = Utilities.class.getClassLoader().getResourceAsStream(name);
+ byte[] buffer = new byte[1024];
+ int len = in.read(buffer);
+ StringBuilder string = new StringBuilder();
+ while (len > 0) {
+ for(int i=0; i < len; ++i) {
+ // strip out newlines and spaces so the schema parses as one string
+ if (buffer[i] != '\n' && buffer[i] != ' ') {
+ string.append((char) buffer[i]);
+ }
+ }
+ len = in.read(buffer);
+ }
+ in.close();
+ return TypeDescription.fromString(string.toString());
+ }
+
+ public static org.apache.orc.CompressionKind getCodec(CompressionKind compression) {
+ switch (compression) {
+ case NONE:
+ return org.apache.orc.CompressionKind.NONE;
+ case ZLIB:
+ return org.apache.orc.CompressionKind.ZLIB;
+ case SNAPPY:
+ return org.apache.orc.CompressionKind.SNAPPY;
+ default:
+ throw new IllegalArgumentException("Unknown compression " + compression);
+ }
+ }
+
+ public static Iterable<String> sliceArray(final String[] array,
+ final int start) {
+ return new Iterable<String>() {
+ final String[] values = array;
+
+ @Override
+ public Iterator<String> iterator() {
+ return new Iterator<String>() {
+ // keep the scan position in the iterator so that each call to
+ // iterator() starts a fresh pass instead of sharing one cursor
+ int posn = start;
+ @Override
+ public boolean hasNext() {
+ return posn < values.length;
+ }
+
+ @Override
+ public String next() {
+ if (posn >= values.length) {
+ throw new NoSuchElementException("Index off end of array." +
+ " index = " + posn + " length = " + values.length);
+ } else {
+ return values[posn++];
+ }
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException("No remove");
+ }
+ };
+ }
+ };
+ }
+
+ public static Properties convertSchemaToHiveConfig(TypeDescription schema) {
+ Properties result = new Properties();
+ if (schema.getCategory() != TypeDescription.Category.STRUCT) {
+ throw new IllegalArgumentException("Hive requires struct root types" +
+ " instead of " + schema);
+ }
+ StringBuilder columns = new StringBuilder();
+ StringBuilder types = new StringBuilder();
+ List<String> columnNames = schema.getFieldNames();
+ List<TypeDescription> columnTypes = schema.getChildren();
+ for(int c=0; c < columnNames.size(); ++c) {
+ if (c != 0) {
+ columns.append(",");
+ types.append(",");
+ }
+ columns.append(columnNames.get(c));
+ types.append(columnTypes.get(c));
+ }
+ result.setProperty(serdeConstants.LIST_COLUMNS, columns.toString());
+ result.setProperty(serdeConstants.LIST_COLUMN_TYPES, types.toString());
+ return result;
+ }
+
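+ // e.g. getVariant(root, "taxi", "orc", "zlib") resolves to
+ // <root>/generated/taxi/orc.zlib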
+ public static Path getVariant(Path root,
+ String data,
+ String format,
+ String compress) {
+ return new Path(root, "generated/" + data + "/" + format + "." + compress);
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/BatchReader.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/BatchReader.java b/java/bench/src/java/org/apache/orc/bench/convert/BatchReader.java
new file mode 100644
index 0000000..b9ea356
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/BatchReader.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.convert;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import java.io.IOException;
+
+/**
+ * Generic interface for reading data.
+ */
+public interface BatchReader extends AutoCloseable {
+
+ boolean nextBatch(VectorizedRowBatch batch) throws IOException;
+
+ @Override
+ void close() throws IOException;
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/BatchWriter.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/BatchWriter.java b/java/bench/src/java/org/apache/orc/bench/convert/BatchWriter.java
new file mode 100644
index 0000000..c79d937
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/BatchWriter.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.convert;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import java.io.IOException;
+
+/**
+ * Generic interface for writing data.
+ */
+public interface BatchWriter extends AutoCloseable {
+
+ void writeBatch(VectorizedRowBatch batch) throws IOException;
+
+ @Override
+ void close() throws IOException;
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/GenerateVariants.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/GenerateVariants.java b/java/bench/src/java/org/apache/orc/bench/convert/GenerateVariants.java
new file mode 100644
index 0000000..7f57468
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/GenerateVariants.java
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.convert;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.CompressionKind;
+import org.apache.orc.bench.SalesGenerator;
+import org.apache.orc.bench.Utilities;
+import org.apache.orc.bench.convert.avro.AvroReader;
+import org.apache.orc.bench.convert.avro.AvroWriter;
+import org.apache.orc.bench.convert.csv.CsvReader;
+import org.apache.orc.bench.convert.json.JsonReader;
+import org.apache.orc.bench.convert.json.JsonWriter;
+import org.apache.orc.bench.convert.orc.OrcReader;
+import org.apache.orc.bench.convert.orc.OrcWriter;
+import org.apache.orc.bench.convert.parquet.ParquetReader;
+import org.apache.orc.bench.convert.parquet.ParquetWriter;
+
+import java.io.IOException;
+
+/**
+ * A tool to create the different variants that we need to benchmark against.
+ */
+public class GenerateVariants {
+
+ public static BatchWriter createFileWriter(Path file,
+ String format,
+ TypeDescription schema,
+ Configuration conf,
+ CompressionKind compress
+ ) throws IOException {
+ FileSystem fs = file.getFileSystem(conf);
+ fs.delete(file, false);
+ fs.mkdirs(file.getParent());
+ switch (format) {
+ case "json":
+ return new JsonWriter(file, schema, conf, compress);
+ case "orc":
+ return new OrcWriter(file, schema, conf, compress);
+ case "avro":
+ return new AvroWriter(file, schema, conf, compress);
+ case "parquet":
+ return new ParquetWriter(file, schema, conf, compress);
+ default:
+ throw new IllegalArgumentException("Unknown format " + format);
+ }
+ }
+
+ public static BatchReader createFileReader(Path file,
+ String format,
+ TypeDescription schema,
+ Configuration conf,
+ CompressionKind compress
+ ) throws IOException {
+ switch (format) {
+ case "csv":
+ return new CsvReader(file, schema, conf, compress);
+ case "json":
+ return new JsonReader(file, schema, conf, compress);
+ case "orc":
+ return new OrcReader(file, schema, conf);
+ case "avro":
+ return new AvroReader(file, schema, conf);
+ case "parquet":
+ return new ParquetReader(file, schema, conf);
+ default:
+ throw new IllegalArgumentException("Unknown format " + format);
+ }
+ }
+
+ static class RecursiveReader implements BatchReader {
+ private final RemoteIterator<LocatedFileStatus> filenames;
+ private final String format;
+ private final TypeDescription schema;
+ private final Configuration conf;
+ private final CompressionKind compress;
+ private BatchReader current = null;
+
+ RecursiveReader(Path root,
+ String format,
+ TypeDescription schema,
+ Configuration conf,
+ CompressionKind compress) throws IOException {
+ FileSystem fs = root.getFileSystem(conf);
+ filenames = fs.listFiles(root, true);
+ this.format = format;
+ this.schema = schema;
+ this.conf = conf;
+ this.compress = compress;
+ }
+
+ @Override
+ public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
+ while (current == null || !current.nextBatch(batch)) {
+ if (filenames.hasNext()) {
+ LocatedFileStatus next = filenames.next();
+ if (next.isFile()) {
+ current = createFileReader(next.getPath(), format, schema, conf,
+ compress);
+ }
+ } else {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (current != null) {
+ current.close();
+ }
+ }
+ }
+
+ public static BatchReader createReader(Path root,
+ String dataName,
+ TypeDescription schema,
+ Configuration conf,
+ long salesRecords) throws IOException {
+ switch (dataName) {
+ case "taxi":
+ return new RecursiveReader(new Path(root, "sources/" + dataName), "csv",
+ schema, conf, CompressionKind.ZLIB);
+ case "sales":
+ return new SalesGenerator(salesRecords);
+ case "github":
+ return new RecursiveReader(new Path(root, "sources/" + dataName), "json",
+ schema, conf, CompressionKind.ZLIB);
+ default:
+ throw new IllegalArgumentException("Unknown data name " + dataName);
+ }
+ }
+
+ static CommandLine parseCommandLine(String[] args) throws ParseException {
+ Options options = new Options()
+ .addOption("h", "help", false, "Provide help")
+ .addOption("c", "compress", true, "List of compression")
+ .addOption("d", "data", true, "List of data sets")
+ .addOption("f", "format", true, "List of formats")
+ .addOption("s", "sales", true, "Number of records for sales");
+ CommandLine result = new DefaultParser().parse(options, args);
+ if (result.hasOption("help") || result.getArgs().length == 0) {
+ new HelpFormatter().printHelp("convert <root>", options);
+ System.exit(1);
+ }
+ return result;
+ }
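+
+ // Example (paths illustrative): generate only the zlib ORC variant of the
+ // sales data with one million rows:
+ //   convert /tmp/bench -d sales -f orc -c zlib -s 1000000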
+
+ public static void main(String[] args) throws Exception {
+ CommandLine cli = parseCommandLine(args);
+ String[] compressList =
+ cli.getOptionValue("compress", "none,snappy,zlib").split(",");
+ String[] dataList =
+ cli.getOptionValue("data", "taxi,sales,github").split(",");
+ String[] formatList =
+ cli.getOptionValue("format", "avro,json,orc,parquet").split(",");
+ long records = Long.parseLong(cli.getOptionValue("sales", "25000000"));
+ Configuration conf = new Configuration();
+ Path root = new Path(cli.getArgs()[0]);
+ for(String data: dataList) {
+ // Set up the reader
+ TypeDescription schema = Utilities.loadSchema(data + ".schema");
+ BatchReader reader = createReader(root, data, schema, conf, records);
+
+ // Set up the writers for each combination
+ BatchWriter[] writers = new BatchWriter[compressList.length * formatList.length];
+ for(int compress=0; compress < compressList.length; ++compress) {
+ CompressionKind compressionKind =
+ CompressionKind.valueOf(compressList[compress].toUpperCase());
+ for(int format=0; format < formatList.length; ++format) {
+ Path outPath = Utilities.getVariant(root, data, formatList[format],
+ compressList[compress]);
+ writers[compress * formatList.length + format] =
+ createFileWriter(outPath, formatList[format], schema, conf,
+ compressionKind);
+ }
+ }
+
+ // Copy the rows
+ VectorizedRowBatch batch = schema.createRowBatch();
+ while (reader.nextBatch(batch)) {
+ for(BatchWriter writer: writers) {
+ writer.writeBatch(batch);
+ }
+ }
+ reader.close();
+ for(BatchWriter writer: writers) {
+ writer.close();
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/ScanVariants.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/ScanVariants.java b/java/bench/src/java/org/apache/orc/bench/convert/ScanVariants.java
new file mode 100644
index 0000000..ae76238
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/ScanVariants.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.convert;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.CompressionKind;
+import org.apache.orc.bench.Utilities;
+
+/**
+ * A tool to scan the generated variants and count their rows and batches.
+ */
+public class ScanVariants {
+
+
+ static CommandLine parseCommandLine(String[] args) throws ParseException {
+ Options options = new Options()
+ .addOption("h", "help", false, "Provide help")
+ .addOption("c", "compress", true, "List of compression")
+ .addOption("d", "data", true, "List of data sets")
+ .addOption("f", "format", true, "List of formats");
+ CommandLine result = new DefaultParser().parse(options, args);
+ if (result.hasOption("help") || result.getArgs().length == 0) {
+ new HelpFormatter().printHelp("scan <root>", options);
+ System.exit(1);
+ }
+ return result;
+ }
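+
+ // Example (path illustrative): count rows in just the ORC variants:
+ //   scan /tmp/bench -f orc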
+
+ public static void main(String[] args) throws Exception {
+ CommandLine cli = parseCommandLine(args);
+ String[] compressList =
+ cli.getOptionValue("compress", "none,snappy,zlib").split(",");
+ String[] dataList =
+ cli.getOptionValue("data", "taxi,sales,github").split(",");
+ String[] formatList =
+ cli.getOptionValue("format", "avro,json,orc,parquet").split(",");
+ Configuration conf = new Configuration();
+ Path root = new Path(cli.getArgs()[0]);
+ for(String data: dataList) {
+ TypeDescription schema = Utilities.loadSchema(data + ".schema");
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (String compress : compressList) {
+ CompressionKind compressKind =
+ CompressionKind.valueOf(compress.toUpperCase());
+ for (String format : formatList) {
+ Path filename = Utilities.getVariant(root, data, format,
+ compress);
+ BatchReader reader = GenerateVariants.createFileReader(filename,
+ format, schema, conf, compressKind);
+ long rows = 0;
+ long batches = 0;
+ while (reader.nextBatch(batch)) {
+ batches += 1;
+ rows += batch.size;
+ }
+ System.out.println(filename + " rows: " + rows + " batches: "
+ + batches);
+ reader.close();
+ }
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/avro/AvroReader.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/avro/AvroReader.java b/java/bench/src/java/org/apache/orc/bench/convert/avro/AvroReader.java
new file mode 100644
index 0000000..fc354d6
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/avro/AvroReader.java
@@ -0,0 +1,299 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.convert.avro;
+
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericDatumReader;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.mapred.FsInput;
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.convert.BatchReader;
+
+import java.io.IOException;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+public class AvroReader implements BatchReader {
+ private final DataFileReader<GenericRecord> dataFileReader;
+ private GenericRecord record = null;
+ private final AvroConverter[] converters;
+
+ public AvroReader(Path path,
+ TypeDescription schema,
+ Configuration conf) throws IOException {
+ FsInput file = new FsInput(path, conf);
+ DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
+ dataFileReader = new DataFileReader<>(file, datumReader);
+ List<TypeDescription> children = schema.getChildren();
+ converters = new AvroConverter[children.size()];
+ for(int c=0; c < converters.length; ++c) {
+ converters[c] = createConverter(children.get(c));
+ }
+ }
+
+ @Override
+ public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
+ batch.reset();
+ int maxSize = batch.getMaxSize();
+ while (dataFileReader.hasNext() && batch.size < maxSize) {
+ record = dataFileReader.next(record);
+ int row = batch.size++;
+ for(int c=0; c < converters.length; ++c) {
+ converters[c].convert(batch.cols[c], row, record.get(c));
+ }
+ }
+ return batch.size != 0;
+ }
+
+ @Override
+ public void close() throws IOException {
+ dataFileReader.close();
+ }
+
+ interface AvroConverter {
+ void convert(ColumnVector vector, int row, Object value);
+ }
+
+ private static class BooleanConverter implements AvroConverter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ ((LongColumnVector) cv).vector[row] =
+ ((Boolean) value).booleanValue() ? 1 : 0;
+ }
+ }
+ }
+
+ private static class IntConverter implements AvroConverter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ ((LongColumnVector) cv).vector[row] =
+ ((Integer) value).intValue();
+ }
+ }
+ }
+
+ private static class LongConverter implements AvroConverter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ ((LongColumnVector) cv).vector[row] =
+ ((Long) value).longValue();
+ }
+ }
+ }
+
+ private static class FloatConverter implements AvroConverter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ ((DoubleColumnVector) cv).vector[row] =
+ ((Float) value).floatValue();
+ }
+ }
+ }
+
+ private static class DoubleConverter implements AvroConverter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ ((DoubleColumnVector) cv).vector[row] =
+ ((Double) value).doubleValue();
+ }
+ }
+ }
+
+ private static class StringConverter implements AvroConverter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ byte[] bytes = ((Utf8) value).getBytes();
+ ((BytesColumnVector) cv).setRef(row, bytes, 0, bytes.length);
+ }
+ }
+ }
+
+ private static class BinaryConverter implements AvroConverter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ ByteBuffer buf = (ByteBuffer) value;
+ ((BytesColumnVector) cv).setVal(row, buf.array(), buf.arrayOffset(),
+ buf.remaining());
+ }
+ }
+ }
+
+ private static class TimestampConverter implements AvroConverter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ TimestampColumnVector tc = (TimestampColumnVector) cv;
+ tc.time[row] = ((Long) value).longValue();
+ tc.nanos[row] = 0;
+ }
+ }
+ }
+
+ private static class DecimalConverter implements AvroConverter {
+ final int scale;
+ DecimalConverter(int scale) {
+ this.scale = scale;
+ }
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ DecimalColumnVector tc = (DecimalColumnVector) cv;
+ tc.vector[row].set(getHiveDecimalFromByteBuffer((ByteBuffer) value, scale));
+ }
+ }
+ }
+
+ private static class ListConverter implements AvroConverter {
+ final AvroConverter childConverter;
+
+ ListConverter(TypeDescription schema) {
+ childConverter = createConverter(schema.getChildren().get(0));
+ }
+
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ ListColumnVector tc = (ListColumnVector) cv;
+ GenericData.Array array = (GenericData.Array) value;
+ int start = tc.childCount;
+ int len = array.size();
+ tc.childCount += len;
+ tc.child.ensureSize(tc.childCount, true);
+ for(int i=0; i < len; ++i) {
+ childConverter.convert(tc.child, start + i, array.get(i));
+ }
+ }
+ }
+ }
+
+ private static class StructConverter implements AvroConverter {
+ final AvroConverter[] childConverters;
+
+ StructConverter(TypeDescription schema) {
+ List<TypeDescription> children = schema.getChildren();
+ childConverters = new AvroConverter[children.size()];
+ for(int i=0; i < childConverters.length; ++i) {
+ childConverters[i] = createConverter(children.get(i));
+ }
+ }
+
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ StructColumnVector tc = (StructColumnVector) cv;
+ GenericData.Record record = (GenericData.Record) value;
+ for(int c=0; c < tc.fields.length; ++c) {
+ childConverters[c].convert(tc.fields[c], row, record.get(c));
+ }
+ }
+ }
+ }
+
+ static AvroConverter createConverter(TypeDescription types) {
+ switch (types.getCategory()) {
+ case BINARY:
+ return new BinaryConverter();
+ case BOOLEAN:
+ return new BooleanConverter();
+ case BYTE:
+ case SHORT:
+ case INT:
+ return new IntConverter();
+ case LONG:
+ return new LongConverter();
+ case FLOAT:
+ return new FloatConverter();
+ case DOUBLE:
+ return new DoubleConverter();
+ case CHAR:
+ case VARCHAR:
+ case STRING:
+ return new StringConverter();
+ case TIMESTAMP:
+ return new TimestampConverter();
+ case DECIMAL:
+ return new DecimalConverter(types.getScale());
+ case LIST:
+ return new ListConverter(types);
+ case STRUCT:
+ return new StructConverter(types);
+ default:
+ throw new IllegalArgumentException("Unhandled type " + types);
+ }
+ }
+
+ static byte[] getBytesFromByteBuffer(ByteBuffer byteBuffer) {
+ byteBuffer.rewind();
+ byte[] result = new byte[byteBuffer.limit()];
+ byteBuffer.get(result);
+ return result;
+ }
+
+ static HiveDecimal getHiveDecimalFromByteBuffer(ByteBuffer byteBuffer,
+ int scale) {
+ byte[] result = getBytesFromByteBuffer(byteBuffer);
+ HiveDecimal dec = HiveDecimal.create(new BigInteger(result), scale);
+ return dec;
+ }
+}
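
For reference, the two helpers above implement Avro's decimal encoding: the unscaled value as big-endian two's-complement bytes, paired with the scale fixed in the schema. A minimal round-trip sketch, assuming only hive-storage-api on the classpath (the class name DecimalRoundTrip is made up for illustration):

    import org.apache.hadoop.hive.common.type.HiveDecimal;

    import java.math.BigInteger;
    import java.nio.ByteBuffer;

    public class DecimalRoundTrip {
      public static void main(String[] args) {
        // 123.45 at scale 2 has unscaled value 12345.
        ByteBuffer encoded =
            ByteBuffer.wrap(BigInteger.valueOf(12345).toByteArray());
        byte[] bytes = new byte[encoded.remaining()];
        encoded.get(bytes);
        HiveDecimal dec = HiveDecimal.create(new BigInteger(bytes), 2);
        System.out.println(dec);  // prints 123.45
      }
    }
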
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/avro/AvroSchemaUtils.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/avro/AvroSchemaUtils.java b/java/bench/src/java/org/apache/orc/bench/convert/avro/AvroSchemaUtils.java
new file mode 100644
index 0000000..6c72a0e
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/avro/AvroSchemaUtils.java
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.bench.convert.avro;
+
+import org.apache.avro.Schema;
+import org.apache.orc.TypeDescription;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Converts an ORC TypeDescription (Hive TypeInfo) into an Avro Schema.
+ */
+public class AvroSchemaUtils {
+
+ private AvroSchemaUtils() {
+ // No instances
+ }
+
+ public static Schema createAvroSchema(TypeDescription typeInfo) {
+ Schema schema;
+ switch (typeInfo.getCategory()) {
+ case STRING:
+ schema = Schema.create(Schema.Type.STRING);
+ break;
+ case CHAR:
+ schema = getSchemaFor("{" +
+ "\"type\":\"string\"," +
+ "\"logicalType\":\"char\"," +
+ "\"maxLength\":" + typeInfo.getMaxLength() + "}");
+ break;
+ case VARCHAR:
+ schema = getSchemaFor("{" +
+ "\"type\":\"string\"," +
+ "\"logicalType\":\"varchar\"," +
+ "\"maxLength\":" + typeInfo.getMaxLength() + "}");
+ break;
+ case BINARY:
+ schema = Schema.create(Schema.Type.BYTES);
+ break;
+ case BYTE:
+ schema = Schema.create(Schema.Type.INT);
+ break;
+ case SHORT:
+ schema = Schema.create(Schema.Type.INT);
+ break;
+ case INT:
+ schema = Schema.create(Schema.Type.INT);
+ break;
+ case LONG:
+ schema = Schema.create(Schema.Type.LONG);
+ break;
+ case FLOAT:
+ schema = Schema.create(Schema.Type.FLOAT);
+ break;
+ case DOUBLE:
+ schema = Schema.create(Schema.Type.DOUBLE);
+ break;
+ case BOOLEAN:
+ schema = Schema.create(Schema.Type.BOOLEAN);
+ break;
+ case DECIMAL:
+ String precision = String.valueOf(typeInfo.getPrecision());
+ String scale = String.valueOf(typeInfo.getScale());
+ schema = getSchemaFor("{" +
+ "\"type\":\"bytes\"," +
+ "\"logicalType\":\"decimal\"," +
+ "\"precision\":" + precision + "," +
+ "\"scale\":" + scale + "}");
+ break;
+ case DATE:
+ schema = getSchemaFor("{" +
+ "\"type\":\"int\"," +
+ "\"logicalType\":\"date\"}");
+ break;
+ case TIMESTAMP:
+ schema = getSchemaFor("{" +
+ "\"type\":\"long\"," +
+ "\"logicalType\":\"timestamp-millis\"}");
+ break;
+ case LIST:
+ schema = createAvroArray(typeInfo);
+ break;
+ case MAP:
+ schema = createAvroMap(typeInfo);
+ break;
+ case STRUCT:
+ schema = createAvroRecord(typeInfo);
+ break;
+ case UNION:
+ schema = createAvroUnion(typeInfo);
+ break;
+ default:
+ throw new UnsupportedOperationException(typeInfo + " is not supported.");
+ }
+
+ return schema;
+ }
+
+ private static Schema createAvroUnion(TypeDescription typeInfo) {
+ List<Schema> childSchemas = new ArrayList<>();
+ for (TypeDescription childTypeInfo : typeInfo.getChildren()) {
+ Schema childSchema = createAvroSchema(childTypeInfo);
+ if (childSchema.getType() == Schema.Type.UNION) {
+ for (Schema grandkid: childSchema.getTypes()) {
+ if (grandkid.getType() != Schema.Type.NULL) {
+ childSchemas.add(grandkid);
+ }
+ }
+ } else {
+ childSchemas.add(childSchema);
+ }
+ }
+
+ return wrapInUnionWithNull(Schema.createUnion(childSchemas));
+ }
+
+ private static Schema createAvroRecord(TypeDescription typeInfo) {
+ List<Schema.Field> childFields = new ArrayList<>();
+
+ List<String> fieldNames = typeInfo.getFieldNames();
+ List<TypeDescription> fieldTypes = typeInfo.getChildren();
+
+ for (int i = 0; i < fieldNames.size(); ++i) {
+ TypeDescription childTypeInfo = fieldTypes.get(i);
+ Schema.Field field = new Schema.Field(fieldNames.get(i),
+ wrapInUnionWithNull(createAvroSchema(childTypeInfo)),
+ childTypeInfo.toString(),
+ (Object) null);
+ childFields.add(field);
+ }
+
+ Schema recordSchema = Schema.createRecord("record_" + typeInfo.getId(),
+ typeInfo.toString(), null, false);
+ recordSchema.setFields(childFields);
+ return recordSchema;
+ }
+
+ private static Schema createAvroMap(TypeDescription typeInfo) {
+ TypeDescription keyTypeInfo = typeInfo.getChildren().get(0);
+ if (keyTypeInfo.getCategory() != TypeDescription.Category.STRING) {
+ throw new UnsupportedOperationException("Avro only supports maps with string keys "
+ + typeInfo);
+ }
+
+ Schema valueSchema = wrapInUnionWithNull(createAvroSchema
+ (typeInfo.getChildren().get(1)));
+
+ return Schema.createMap(valueSchema);
+ }
+
+ private static Schema createAvroArray(TypeDescription typeInfo) {
+ Schema child = createAvroSchema(typeInfo.getChildren().get(0));
+ return Schema.createArray(wrapInUnionWithNull(child));
+ }
+
+ private static Schema wrapInUnionWithNull(Schema schema) {
+ Schema NULL = Schema.create(Schema.Type.NULL);
+ switch (schema.getType()) {
+ case NULL:
+ return schema;
+ case UNION:
+ List<Schema> kids = schema.getTypes();
+ List<Schema> newKids = new ArrayList<>(kids.size() + 1);
+ newKids.add(NULL);
+ // Keep the existing branches; without this the union would
+ // collapse to just null.
+ for (Schema kid: kids) {
+ if (kid.getType() != Schema.Type.NULL) {
+ newKids.add(kid);
+ }
+ }
+ return Schema.createUnion(newKids);
+ default:
+ return Schema.createUnion(Arrays.asList(NULL, schema));
+ }
+ }
+
+ private static Schema getSchemaFor(String str) {
+ Schema.Parser parser = new Schema.Parser();
+ return parser.parse(str);
+ }
+}
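
A quick way to sanity-check the mapping above is to convert a small schema and print the Avro JSON. A hedged usage sketch (TypeDescription.fromString is ORC's standard schema parser; the schema string is a placeholder):

    import org.apache.avro.Schema;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.bench.convert.avro.AvroSchemaUtils;

    public class SchemaDemo {
      public static void main(String[] args) {
        TypeDescription orc =
            TypeDescription.fromString("struct<id:bigint,name:string>");
        Schema avro = AvroSchemaUtils.createAvroSchema(orc);
        // Each field comes back as union(null, type), per wrapInUnionWithNull.
        System.out.println(avro.toString(true));
      }
    }
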
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/avro/AvroWriter.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/avro/AvroWriter.java b/java/bench/src/java/org/apache/orc/bench/convert/avro/AvroWriter.java
new file mode 100644
index 0000000..44defbf
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/avro/AvroWriter.java
@@ -0,0 +1,363 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.convert.avro;
+
+import org.apache.avro.Schema;
+import org.apache.avro.file.CodecFactory;
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericDatumWriter;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.convert.BatchWriter;
+import org.apache.orc.bench.CompressionKind;
+
+import java.io.IOException;
+import java.nio.Buffer;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+public class AvroWriter implements BatchWriter {
+
+ interface AvroConverter {
+ Object convert(ColumnVector vector, int row);
+ }
+
+ private static class BooleanConverter implements AvroConverter {
+ public Object convert(ColumnVector cv, int row) {
+ if (cv.isRepeating) {
+ row = 0;
+ }
+ if (cv.noNulls || !cv.isNull[row]) {
+ LongColumnVector vector = (LongColumnVector) cv;
+ return vector.vector[row] != 0;
+ } else {
+ return null;
+ }
+ }
+ }
+
+ private static class IntConverter implements AvroConverter {
+ public Object convert(ColumnVector cv, int row) {
+ if (cv.isRepeating) {
+ row = 0;
+ }
+ if (cv.noNulls || !cv.isNull[row]) {
+ LongColumnVector vector = (LongColumnVector) cv;
+ return (int) vector.vector[row];
+ } else {
+ return null;
+ }
+ }
+ }
+
+ private static class LongConverter implements AvroConverter {
+ public Object convert(ColumnVector cv, int row) {
+ if (cv.isRepeating) {
+ row = 0;
+ }
+ if (cv.noNulls || !cv.isNull[row]) {
+ LongColumnVector vector = (LongColumnVector) cv;
+ return vector.vector[row];
+ } else {
+ return null;
+ }
+ }
+ }
+
+ private static class FloatConverter implements AvroConverter {
+ public Object convert(ColumnVector cv, int row) {
+ if (cv.isRepeating) {
+ row = 0;
+ }
+ if (cv.noNulls || !cv.isNull[row]) {
+ DoubleColumnVector vector = (DoubleColumnVector) cv;
+ return (float) vector.vector[row];
+ } else {
+ return null;
+ }
+ }
+ }
+
+ private static class DoubleConverter implements AvroConverter {
+ public Object convert(ColumnVector cv, int row) {
+ if (cv.isRepeating) {
+ row = 0;
+ }
+ if (cv.noNulls || !cv.isNull[row]) {
+ DoubleColumnVector vector = (DoubleColumnVector) cv;
+ return vector.vector[row];
+ } else {
+ return null;
+ }
+ }
+ }
+
+ private static class StringConverter implements AvroConverter {
+ public Object convert(ColumnVector cv, int row) {
+ if (cv.isRepeating) {
+ row = 0;
+ }
+ if (cv.noNulls || !cv.isNull[row]) {
+ BytesColumnVector vector = (BytesColumnVector) cv;
+ return new String(vector.vector[row], vector.start[row],
+ vector.length[row], StandardCharsets.UTF_8);
+ } else {
+ return null;
+ }
+ }
+ }
+
+ private static class BinaryConverter implements AvroConverter {
+ public Object convert(ColumnVector cv, int row) {
+ if (cv.isRepeating) {
+ row = 0;
+ }
+ if (cv.noNulls || !cv.isNull[row]) {
+ BytesColumnVector vector = (BytesColumnVector) cv;
+ return ByteBuffer.wrap(vector.vector[row], vector.start[row],
+ vector.length[row]);
+ } else {
+ return null;
+ }
+ }
+ }
+
+ private static class TimestampConverter implements AvroConverter {
+ public Object convert(ColumnVector cv, int row) {
+ if (cv.isRepeating) {
+ row = 0;
+ }
+ if (cv.noNulls || !cv.isNull[row]) {
+ TimestampColumnVector vector = (TimestampColumnVector) cv;
+ return vector.time[row];
+ } else {
+ return null;
+ }
+ }
+ }
+
+ private static class DecimalConverter implements AvroConverter {
+ final int scale;
+ DecimalConverter(int scale) {
+ this.scale = scale;
+ }
+ public Object convert(ColumnVector cv, int row) {
+ if (cv.isRepeating) {
+ row = 0;
+ }
+ if (cv.noNulls || !cv.isNull[row]) {
+ DecimalColumnVector vector = (DecimalColumnVector) cv;
+ return getBufferFromDecimal(
+ vector.vector[row].getHiveDecimal(), scale);
+ } else {
+ return null;
+ }
+ }
+ }
+
+ private static class ListConverter implements AvroConverter {
+ final Schema avroSchema;
+ final AvroConverter childConverter;
+
+ ListConverter(TypeDescription schema, Schema avroSchema) {
+ this.avroSchema = avroSchema;
+ childConverter = createConverter(schema.getChildren().get(0),
+ removeNullable(avroSchema.getElementType()));
+ }
+
+ public Object convert(ColumnVector cv, int row) {
+ if (cv.isRepeating) {
+ row = 0;
+ }
+ if (cv.noNulls || !cv.isNull[row]) {
+ ListColumnVector vector = (ListColumnVector) cv;
+ int offset = (int) vector.offsets[row];
+ int length = (int) vector.lengths[row];
+ GenericData.Array result = new GenericData.Array(length, avroSchema);
+ for(int i=0; i < length; ++i) {
+ result.add(childConverter.convert(vector.child, offset + i));
+ }
+ return result;
+ } else {
+ return null;
+ }
+ }
+ }
+
+ private static class StructConverter implements AvroConverter {
+ final Schema avroSchema;
+ final AvroConverter[] childConverters;
+
+ StructConverter(TypeDescription schema, Schema avroSchema) {
+ this.avroSchema = avroSchema;
+ List<TypeDescription> childrenTypes = schema.getChildren();
+ childConverters = new AvroConverter[childrenTypes.size()];
+ List<Schema.Field> fields = avroSchema.getFields();
+ for(int f=0; f < childConverters.length; ++f) {
+ childConverters[f] = createConverter(childrenTypes.get(f),
+ removeNullable(fields.get(f).schema()));
+ }
+ }
+
+ public Object convert(ColumnVector cv, int row) {
+ if (cv.isRepeating) {
+ row = 0;
+ }
+ if (cv.noNulls || !cv.isNull[row]) {
+ StructColumnVector vector = (StructColumnVector) cv;
+ GenericData.Record result = new GenericData.Record(avroSchema);
+ for(int f=0; f < childConverters.length; ++f) {
+ result.put(f, childConverters[f].convert(vector.fields[f], row));
+ }
+ return result;
+ } else {
+ return null;
+ }
+ }
+ }
+
+ static AvroConverter createConverter(TypeDescription types,
+ Schema avroSchema) {
+ switch (types.getCategory()) {
+ case BINARY:
+ return new BinaryConverter();
+ case BOOLEAN:
+ return new BooleanConverter();
+ case BYTE:
+ case SHORT:
+ case INT:
+ return new IntConverter();
+ case LONG:
+ return new LongConverter();
+ case FLOAT:
+ return new FloatConverter();
+ case DOUBLE:
+ return new DoubleConverter();
+ case CHAR:
+ case VARCHAR:
+ case STRING:
+ return new StringConverter();
+ case TIMESTAMP:
+ return new TimestampConverter();
+ case DECIMAL:
+ return new DecimalConverter(types.getScale());
+ case LIST:
+ return new ListConverter(types, avroSchema);
+ case STRUCT:
+ return new StructConverter(types, avroSchema);
+ default:
+ throw new IllegalArgumentException("Unhandled type " + types);
+ }
+ }
+
+ /**
+ * Remove the union(null, ...) wrapper around the schema.
+ *
+ * All of the types in Hive are nullable; in Avro that is represented
+ * by wrapping each type in a union with the null type.
+ * @param avro The avro type
+ * @return The avro type with the nullable layer removed
+ */
+ static Schema removeNullable(Schema avro) {
+ while (avro.getType() == Schema.Type.UNION) {
+ List<Schema> children = avro.getTypes();
+ if (children.size() == 2 &&
+ children.get(0).getType() == Schema.Type.NULL) {
+ avro = children.get(1);
+ } else {
+ break;
+ }
+ }
+ return avro;
+ }
+
+ private final AvroConverter[] converters;
+ private final DataFileWriter<GenericRecord> writer;
+ private final GenericRecord record;
+
+ public AvroWriter(Path path, TypeDescription schema,
+ Configuration conf,
+ CompressionKind compression) throws IOException {
+ List<TypeDescription> childTypes = schema.getChildren();
+ Schema avroSchema = AvroSchemaUtils.createAvroSchema(schema);
+ List<Schema.Field> avroFields = avroSchema.getFields();
+ converters = new AvroConverter[childTypes.size()];
+ for(int c=0; c < converters.length; ++c) {
+ converters[c] = createConverter(childTypes.get(c),
+ removeNullable(avroFields.get(c).schema()));
+ }
+ GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<>(avroSchema);
+ writer = new DataFileWriter<>(gdw);
+ switch (compression) {
+ case NONE:
+ break;
+ case ZLIB:
+ writer.setCodec(CodecFactory.deflateCodec(-1));
+ break;
+ case SNAPPY:
+ writer.setCodec(CodecFactory.snappyCodec());
+ break;
+ default:
+ throw new IllegalArgumentException("Compression unsupported " + compression);
+ }
+ writer.create(avroSchema, path.getFileSystem(conf).create(path));
+ record = new GenericData.Record(avroSchema);
+ }
+
+ public void writeBatch(VectorizedRowBatch batch) throws IOException {
+ for(int r=0; r < batch.size; ++r) {
+ for(int f=0; f < batch.cols.length; ++f) {
+ record.put(f, converters[f].convert(batch.cols[f], r));
+ }
+ writer.append(record);
+ }
+ }
+
+ public void close() throws IOException {
+ writer.close();
+ }
+
+ static Buffer getBufferFromBytes(byte[] input) {
+ ByteBuffer bb = ByteBuffer.wrap(input);
+ return bb.rewind();
+ }
+
+ public static Buffer getBufferFromDecimal(HiveDecimal dec, int scale) {
+ if (dec == null) {
+ return null;
+ }
+
+ dec = dec.setScale(scale);
+ return getBufferFromBytes(dec.unscaledValue().toByteArray());
+ }
+}
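
Driving AvroWriter only requires a batch built from the same schema. A minimal sketch, assuming a local filesystem; the path, schema, and values are placeholders:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.bench.CompressionKind;
    import org.apache.orc.bench.convert.avro.AvroWriter;

    public class AvroWriterDemo {
      public static void main(String[] args) throws Exception {
        TypeDescription schema = TypeDescription.fromString("struct<x:bigint>");
        AvroWriter writer = new AvroWriter(new Path("/tmp/demo.avro"), schema,
            new Configuration(), CompressionKind.NONE);
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector x = (LongColumnVector) batch.cols[0];
        for (int r = 0; r < 10; ++r) {
          x.vector[r] = r;   // fill one column with made-up values
        }
        batch.size = 10;
        writer.writeBatch(batch);
        writer.close();
      }
    }
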
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/csv/CsvReader.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/csv/CsvReader.java b/java/bench/src/java/org/apache/orc/bench/convert/csv/CsvReader.java
new file mode 100644
index 0000000..3246e69
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/csv/CsvReader.java
@@ -0,0 +1,175 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.bench.convert.csv;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.CompressionKind;
+import org.apache.orc.bench.convert.BatchReader;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.sql.Timestamp;
+import java.util.Iterator;
+import java.util.List;
+
+public class CsvReader implements BatchReader {
+ private final Iterator<CSVRecord> parser;
+ private final ColumnReader[] readers;
+
+ interface ColumnReader {
+ void read(String value, ColumnVector vect, int row);
+ }
+
+ static class LongColumnReader implements ColumnReader {
+ public void read(String value, ColumnVector vect, int row) {
+ if ("".equals(value)) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ LongColumnVector vector = (LongColumnVector) vect;
+ vector.vector[row] = Long.parseLong(value);
+ }
+ }
+ }
+
+ static class DoubleColumnReader implements ColumnReader {
+ public void read(String value, ColumnVector vect, int row) {
+ if ("".equals(value)) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ DoubleColumnVector vector = (DoubleColumnVector) vect;
+ vector.vector[row] = Double.parseDouble(value);
+ }
+ }
+ }
+
+ static class StringColumnReader implements ColumnReader {
+ public void read(String value, ColumnVector vect, int row) {
+ if ("".equals(value)) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ BytesColumnVector vector = (BytesColumnVector) vect;
+ byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
+ vector.setRef(row, bytes, 0, bytes.length);
+ }
+ }
+ }
+
+ static class TimestampColumnReader implements ColumnReader {
+ public void read(String value, ColumnVector vect, int row) {
+ if ("".equals(value)) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ TimestampColumnVector vector = (TimestampColumnVector) vect;
+ vector.set(row, Timestamp.valueOf(value));
+ }
+ }
+ }
+
+ static class DecimalColumnReader implements ColumnReader {
+ public void read(String value, ColumnVector vect, int row) {
+ if ("".equals(value)) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ DecimalColumnVector vector = (DecimalColumnVector) vect;
+ vector.vector[row].set(HiveDecimal.create(value));
+ }
+ }
+ }
+
+ ColumnReader createReader(TypeDescription schema) {
+ switch (schema.getCategory()) {
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new LongColumnReader();
+ case FLOAT:
+ case DOUBLE:
+ return new DoubleColumnReader();
+ case CHAR:
+ case VARCHAR:
+ case STRING:
+ return new StringColumnReader();
+ case DECIMAL:
+ return new DecimalColumnReader();
+ case TIMESTAMP:
+ return new TimestampColumnReader();
+ default:
+ throw new IllegalArgumentException("Unhandled type " + schema);
+ }
+ }
+
+ public CsvReader(Path path,
+ TypeDescription schema,
+ Configuration conf,
+ CompressionKind compress) throws IOException {
+ FileSystem fs = path.getFileSystem(conf);
+ InputStream input = compress.read(fs.open(path));
+ parser = new CSVParser(new InputStreamReader(input, StandardCharsets.UTF_8),
+ CSVFormat.RFC4180.withHeader()).iterator();
+ List<TypeDescription> columnTypes = schema.getChildren();
+ readers = new ColumnReader[columnTypes.size()];
+ int c = 0;
+ for(TypeDescription columnType: columnTypes) {
+ readers[c++] = createReader(columnType);
+ }
+ }
+
+ public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
+ batch.reset();
+ int maxSize = batch.getMaxSize();
+ while (parser.hasNext() && batch.size < maxSize) {
+ CSVRecord record = parser.next();
+ int c = 0;
+ for(String val: record) {
+ readers[c].read(val, batch.cols[c], batch.size);
+ c += 1;
+ }
+ batch.size++;
+ }
+ return batch.size != 0;
+ }
+
+ public void close() {
+ // PASS
+ }
+}
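
The read side follows the same batch protocol: call nextBatch until it returns false. A sketch that counts the rows of a gzipped CSV (path and schema are placeholders; the parser expects a header row because of CSVFormat.RFC4180.withHeader()):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.bench.CompressionKind;
    import org.apache.orc.bench.convert.csv.CsvReader;

    public class CsvCount {
      public static void main(String[] args) throws Exception {
        TypeDescription schema =
            TypeDescription.fromString("struct<id:bigint,name:string>");
        CsvReader reader = new CsvReader(new Path("/tmp/demo.csv.gz"), schema,
            new Configuration(), CompressionKind.ZLIB);
        VectorizedRowBatch batch = schema.createRowBatch();
        long rows = 0;
        while (reader.nextBatch(batch)) {
          rows += batch.size;
        }
        reader.close();
        System.out.println(rows + " rows");
      }
    }
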
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/json/JsonReader.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/json/JsonReader.java b/java/bench/src/java/org/apache/orc/bench/convert/json/JsonReader.java
new file mode 100644
index 0000000..b4ff3122
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/json/JsonReader.java
@@ -0,0 +1,279 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.bench.convert.json;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonStreamParser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.CompressionKind;
+import org.apache.orc.bench.convert.BatchReader;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.sql.Timestamp;
+import java.util.List;
+
+public class JsonReader implements BatchReader {
+ private final TypeDescription schema;
+ private final JsonStreamParser parser;
+ private final JsonConverter[] converters;
+
+ public JsonReader(Path path,
+ TypeDescription schema,
+ Configuration conf,
+ CompressionKind compressionKind) throws IOException {
+ this.schema = schema;
+ FileSystem fs = path.getFileSystem(conf);
+ InputStream input = compressionKind.read(fs.open(path));
+ parser = new JsonStreamParser(new InputStreamReader(input,
+ StandardCharsets.UTF_8));
+ if (schema.getCategory() != TypeDescription.Category.STRUCT) {
+ throw new IllegalArgumentException("Root must be struct - " + schema);
+ }
+ List<TypeDescription> fieldTypes = schema.getChildren();
+ converters = new JsonConverter[fieldTypes.size()];
+ for(int c = 0; c < converters.length; ++c) {
+ converters[c] = createConverter(fieldTypes.get(c));
+ }
+ }
+
+ public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
+ batch.reset();
+ int maxSize = batch.getMaxSize();
+ List<String> fieldNames = schema.getFieldNames();
+ while (parser.hasNext() && batch.size < maxSize) {
+ JsonObject elem = parser.next().getAsJsonObject();
+ for(int c=0; c < converters.length; ++c) {
+ // look up each field to see if it is in the input, otherwise
+ // set it to null.
+ JsonElement field = elem.get(fieldNames.get(c));
+ if (field == null) {
+ batch.cols[c].noNulls = false;
+ batch.cols[c].isNull[batch.size] = true;
+ } else {
+ converters[c].convert(field, batch.cols[c], batch.size);
+ }
+ }
+ batch.size++;
+ }
+ return batch.size != 0;
+ }
+
+ public void close() {
+ // PASS
+ }
+
+ interface JsonConverter {
+ void convert(JsonElement value, ColumnVector vect, int row);
+ }
+
+ static class BooleanColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ LongColumnVector vector = (LongColumnVector) vect;
+ vector.vector[row] = value.getAsBoolean() ? 1 : 0;
+ }
+ }
+ }
+
+ static class LongColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ LongColumnVector vector = (LongColumnVector) vect;
+ vector.vector[row] = value.getAsLong();
+ }
+ }
+ }
+
+ static class DoubleColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ DoubleColumnVector vector = (DoubleColumnVector) vect;
+ vector.vector[row] = value.getAsDouble();
+ }
+ }
+ }
+
+ static class StringColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ BytesColumnVector vector = (BytesColumnVector) vect;
+ byte[] bytes = value.getAsString().getBytes(StandardCharsets.UTF_8);
+ vector.setRef(row, bytes, 0, bytes.length);
+ }
+ }
+ }
+
+ static class BinaryColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ BytesColumnVector vector = (BytesColumnVector) vect;
+ String binStr = value.getAsString();
+ byte[] bytes = new byte[binStr.length()/2];
+ for(int i=0; i < bytes.length; ++i) {
+ bytes[i] = (byte) Integer.parseInt(binStr.substring(i*2, i*2+2), 16);
+ }
+ vector.setRef(row, bytes, 0, bytes.length);
+ }
+ }
+ }
+
+ static class TimestampColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ TimestampColumnVector vector = (TimestampColumnVector) vect;
+ vector.set(row, Timestamp.valueOf(value.getAsString()
+ .replaceAll("[TZ]", " ")));
+ }
+ }
+ }
+
+ static class DecimalColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ DecimalColumnVector vector = (DecimalColumnVector) vect;
+ vector.vector[row].set(HiveDecimal.create(value.getAsString()));
+ }
+ }
+ }
+
+ static class StructColumnConverter implements JsonConverter {
+ private JsonConverter[] childrenConverters;
+ private List<String> fieldNames;
+
+ public StructColumnConverter(TypeDescription schema) {
+ List<TypeDescription> kids = schema.getChildren();
+ childrenConverters = new JsonConverter[kids.size()];
+ for(int c=0; c < childrenConverters.length; ++c) {
+ childrenConverters[c] = createConverter(kids.get(c));
+ }
+ fieldNames = schema.getFieldNames();
+ }
+
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ StructColumnVector vector = (StructColumnVector) vect;
+ JsonObject obj = value.getAsJsonObject();
+ for(int c=0; c < childrenConverters.length; ++c) {
+ JsonElement elem = obj.get(fieldNames.get(c));
+ childrenConverters[c].convert(elem, vector.fields[c], row);
+ }
+ }
+ }
+ }
+
+ static class ListColumnConverter implements JsonConverter {
+ private JsonConverter childrenConverter;
+
+ public ListColumnConverter(TypeDescription schema) {
+ childrenConverter = createConverter(schema.getChildren().get(0));
+ }
+
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ ListColumnVector vector = (ListColumnVector) vect;
+ JsonArray obj = value.getAsJsonArray();
+ vector.lengths[row] = obj.size();
+ vector.offsets[row] = vector.childCount;
+ vector.childCount += vector.lengths[row];
+ vector.child.ensureSize(vector.childCount, true);
+ for(int c=0; c < obj.size(); ++c) {
+ childrenConverter.convert(obj.get(c), vector.child,
+ (int) vector.offsets[row] + c);
+ }
+ }
+ }
+ }
+
+ static JsonConverter createConverter(TypeDescription schema) {
+ switch (schema.getCategory()) {
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new LongColumnConverter();
+ case FLOAT:
+ case DOUBLE:
+ return new DoubleColumnConverter();
+ case CHAR:
+ case VARCHAR:
+ case STRING:
+ return new StringColumnConverter();
+ case DECIMAL:
+ return new DecimalColumnConverter();
+ case TIMESTAMP:
+ return new TimestampColumnConverter();
+ case BINARY:
+ return new BinaryColumnConverter();
+ case BOOLEAN:
+ return new BooleanColumnConverter();
+ case STRUCT:
+ return new StructColumnConverter(schema);
+ case LIST:
+ return new ListColumnConverter(schema);
+ default:
+ throw new IllegalArgumentException("Unhandled type " + schema);
+ }
+ }
+}
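
Note the binary convention shared with JsonWriter below: binary columns travel as lowercase hex strings, two characters per byte, which BinaryColumnConverter parses back. A standalone sketch of that decoding (plain JDK, illustrative input):

    import java.nio.charset.StandardCharsets;

    public class HexDecode {
      public static void main(String[] args) {
        String binStr = "48656c6c6f";              // "Hello" in hex
        byte[] bytes = new byte[binStr.length() / 2];
        for (int i = 0; i < bytes.length; ++i) {
          bytes[i] = (byte) Integer.parseInt(binStr.substring(i * 2, i * 2 + 2), 16);
        }
        System.out.println(new String(bytes, StandardCharsets.UTF_8));  // Hello
      }
    }
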
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/json/JsonWriter.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/json/JsonWriter.java b/java/bench/src/java/org/apache/orc/bench/convert/json/JsonWriter.java
new file mode 100644
index 0000000..bd41115
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/json/JsonWriter.java
@@ -0,0 +1,217 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.convert.json;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonGenerator;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.convert.BatchWriter;
+import org.apache.orc.bench.CompressionKind;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+public class JsonWriter implements BatchWriter {
+ private final Writer outStream;
+ private final JsonGenerator writer;
+ private final TypeDescription schema;
+
+ public JsonWriter(Path path, TypeDescription schema,
+ Configuration conf,
+ CompressionKind compression) throws IOException {
+ OutputStream file = path.getFileSystem(conf).create(path, true);
+ outStream = new OutputStreamWriter(compression.create(file),
+ StandardCharsets.UTF_8);
+ JsonFactory factory = new JsonFactory();
+ factory.setRootValueSeparator("\n");
+ writer = factory.createGenerator(outStream);
+ this.schema = schema;
+ }
+
+ private static void printMap(JsonGenerator writer,
+ MapColumnVector vector,
+ TypeDescription schema,
+ int row) throws IOException {
+ writer.writeStartArray();
+ TypeDescription keyType = schema.getChildren().get(0);
+ TypeDescription valueType = schema.getChildren().get(1);
+ int offset = (int) vector.offsets[row];
+ for (int i = 0; i < vector.lengths[row]; ++i) {
+ writer.writeStartObject();
+ writer.writeFieldName("_key");
+ printValue(writer, vector.keys, keyType, offset + i);
+ writer.writeFieldName("_value");
+ printValue(writer, vector.values, valueType, offset + i);
+ writer.writeEndObject();
+ }
+ writer.writeEndArray();
+ }
+
+ private static void printList(JsonGenerator writer,
+ ListColumnVector vector,
+ TypeDescription schema,
+ int row) throws IOException {
+ writer.writeStartArray();
+ int offset = (int) vector.offsets[row];
+ TypeDescription childType = schema.getChildren().get(0);
+ for (int i = 0; i < vector.lengths[row]; ++i) {
+ printValue(writer, vector.child, childType, offset + i);
+ }
+ writer.writeEndArray();
+ }
+
+ private static void printUnion(JsonGenerator writer,
+ UnionColumnVector vector,
+ TypeDescription schema,
+ int row) throws IOException {
+ int tag = vector.tags[row];
+ printValue(writer, vector.fields[tag], schema.getChildren().get(tag), row);
+ }
+
+ static void printStruct(JsonGenerator writer,
+ StructColumnVector batch,
+ TypeDescription schema,
+ int row) throws IOException {
+ writer.writeStartObject();
+ List<String> fieldNames = schema.getFieldNames();
+ List<TypeDescription> fieldTypes = schema.getChildren();
+ for (int i = 0; i < fieldTypes.size(); ++i) {
+ writer.writeFieldName(fieldNames.get(i));
+ printValue(writer, batch.fields[i], fieldTypes.get(i), row);
+ }
+ writer.writeEndObject();
+ }
+
+ static void printBinary(JsonGenerator writer, BytesColumnVector vector,
+ int row) throws IOException {
+ StringBuilder buffer = new StringBuilder();
+ int offset = vector.start[row];
+ for(int i=0; i < vector.length[row]; ++i) {
+ int value = 0xff & (int) vector.vector[row][offset + i];
+ buffer.append(String.format("%02x", value));
+ }
+ writer.writeString(buffer.toString());
+ }
+
+ static void printValue(JsonGenerator writer, ColumnVector vector,
+ TypeDescription schema, int row) throws IOException {
+ if (vector.isRepeating) {
+ row = 0;
+ }
+ if (vector.noNulls || !vector.isNull[row]) {
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ writer.writeBoolean(((LongColumnVector) vector).vector[row] != 0);
+ break;
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ writer.writeNumber(((LongColumnVector) vector).vector[row]);
+ break;
+ case FLOAT:
+ case DOUBLE:
+ writer.writeNumber(((DoubleColumnVector) vector).vector[row]);
+ break;
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ writer.writeString(((BytesColumnVector) vector).toString(row));
+ break;
+ case BINARY:
+ printBinary(writer, (BytesColumnVector) vector, row);
+ break;
+ case DECIMAL:
+ writer.writeString(((DecimalColumnVector) vector).vector[row].toString());
+ break;
+ case DATE:
+ writer.writeString(new DateWritable(
+ (int) ((LongColumnVector) vector).vector[row]).toString());
+ break;
+ case TIMESTAMP:
+ writer.writeString(((TimestampColumnVector) vector)
+ .asScratchTimestamp(row).toString());
+ break;
+ case LIST:
+ printList(writer, (ListColumnVector) vector, schema, row);
+ break;
+ case MAP:
+ printMap(writer, (MapColumnVector) vector, schema, row);
+ break;
+ case STRUCT:
+ printStruct(writer, (StructColumnVector) vector, schema, row);
+ break;
+ case UNION:
+ printUnion(writer, (UnionColumnVector) vector, schema, row);
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown type " +
+ schema.toString());
+ }
+ } else {
+ writer.writeNull();
+ }
+ }
+
+ static void printRow(JsonGenerator writer,
+ VectorizedRowBatch batch,
+ TypeDescription schema,
+ int row) throws IOException {
+ if (schema.getCategory() == TypeDescription.Category.STRUCT) {
+ List<TypeDescription> fieldTypes = schema.getChildren();
+ List<String> fieldNames = schema.getFieldNames();
+ writer.writeStartObject();
+ for (int c = 0; c < batch.cols.length; ++c) {
+ writer.writeFieldName(fieldNames.get(c));
+ printValue(writer, batch.cols[c], fieldTypes.get(c), row);
+ }
+ writer.writeEndObject();
+ } else {
+ printValue(writer, batch.cols[0], schema, row);
+ }
+ }
+
+ public void writeBatch(VectorizedRowBatch batch) throws IOException {
+ for (int r = 0; r < batch.size; ++r) {
+ printRow(writer, batch, schema, r);
+ }
+ }
+
+ public void close() throws IOException {
+ writer.close();
+ }
+}
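
Since every format in this module implements the shared BatchReader/BatchWriter interfaces, format conversion reduces to one generic copy loop. A hedged sketch of that loop; construction of the concrete reader and writer is left out:

    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.bench.convert.BatchReader;
    import org.apache.orc.bench.convert.BatchWriter;

    import java.io.IOException;

    public class CopyLoop {
      static void copy(BatchReader reader, BatchWriter writer,
                       TypeDescription schema) throws IOException {
        VectorizedRowBatch batch = schema.createRowBatch();
        while (reader.nextBatch(batch)) {
          writer.writeBatch(batch);
        }
        reader.close();
        writer.close();
      }
    }
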
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/orc/OrcReader.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/orc/OrcReader.java b/java/bench/src/java/org/apache/orc/bench/convert/orc/OrcReader.java
new file mode 100644
index 0000000..e648856
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/orc/OrcReader.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.convert.orc;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.convert.BatchReader;
+
+import java.io.IOException;
+
+public class OrcReader implements BatchReader {
+ private final RecordReader reader;
+
+ public OrcReader(Path path,
+ TypeDescription schema,
+ Configuration conf
+ ) throws IOException {
+ Reader file = OrcFile.createReader(path, OrcFile.readerOptions(conf));
+ reader = file.rows(file.options().schema(schema));
+ }
+
+ public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
+ return reader.nextBatch(batch);
+ }
+
+ public void close() throws IOException {
+ reader.close();
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/orc/OrcWriter.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/orc/OrcWriter.java b/java/bench/src/java/org/apache/orc/bench/convert/orc/OrcWriter.java
new file mode 100644
index 0000000..af5de9b
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/orc/OrcWriter.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.convert.orc;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.apache.orc.bench.convert.BatchWriter;
+import org.apache.orc.bench.CompressionKind;
+import org.apache.orc.bench.Utilities;
+
+import java.io.IOException;
+
+public class OrcWriter implements BatchWriter {
+ private final Writer writer;
+
+ public OrcWriter(Path path,
+ TypeDescription schema,
+ Configuration conf,
+ CompressionKind compression
+ ) throws IOException {
+ writer = OrcFile.createWriter(path,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .compress(Utilities.getCodec(compression)));
+ }
+
+ public void writeBatch(VectorizedRowBatch batch) throws IOException {
+ writer.addRowBatch(batch);
+ }
+
+ public void close() throws IOException {
+ writer.close();
+ }
+}
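
Putting two of the classes together gives a concrete CSV-to-ORC conversion; a minimal sketch where the paths and schema are placeholders:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.bench.CompressionKind;
    import org.apache.orc.bench.convert.csv.CsvReader;
    import org.apache.orc.bench.convert.orc.OrcWriter;

    public class CsvToOrc {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        TypeDescription schema =
            TypeDescription.fromString("struct<id:bigint,name:string>");
        CsvReader reader = new CsvReader(new Path("/tmp/in.csv.gz"), schema,
            conf, CompressionKind.ZLIB);
        OrcWriter writer = new OrcWriter(new Path("/tmp/out.orc"), schema,
            conf, CompressionKind.ZLIB);
        VectorizedRowBatch batch = schema.createRowBatch();
        while (reader.nextBatch(batch)) {
          writer.writeBatch(batch);
        }
        reader.close();
        writer.close();
      }
    }
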
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/parquet/ParquetReader.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/parquet/ParquetReader.java b/java/bench/src/java/org/apache/orc/bench/convert/parquet/ParquetReader.java
new file mode 100644
index 0000000..83f70f4
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/parquet/ParquetReader.java
@@ -0,0 +1,297 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.convert.parquet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.serde2.io.TimestampWritable;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.convert.BatchReader;
+
+import java.io.IOException;
+import java.util.List;
+
+public class ParquetReader implements BatchReader {
+
+ private final NullWritable nada = NullWritable.get();
+ private final RecordReader<NullWritable,ArrayWritable> reader;
+ private final ArrayWritable value;
+ private final Converter[] converters;
+
+ public ParquetReader(Path path,
+ TypeDescription schema,
+ Configuration conf) throws IOException {
+ FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[]{});
+ JobConf jobConf = new JobConf(conf);
+ reader = new MapredParquetInputFormat().getRecordReader(split, jobConf,
+ Reporter.NULL);
+ value = reader.createValue();
+ converters = new Converter[schema.getChildren().size()];
+ List<TypeDescription> children = schema.getChildren();
+ for(int c = 0; c < converters.length; ++c) {
+ converters[c] = createConverter(children.get(c));
+ }
+ }
+
+ @Override
+ public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
+ batch.reset();
+ int maxSize = batch.getMaxSize();
+ while (batch.size < maxSize && reader.next(nada, value)) {
+ Writable[] values = value.get();
+ int row = batch.size++;
+ for(int c=0; c < batch.cols.length; ++c) {
+ converters[c].convert(batch.cols[c], row, values[c]);
+ }
+ }
+ return batch.size != 0;
+ }
+
+ @Override
+ public void close() throws IOException {
+ reader.close();
+ }
+
+ interface Converter {
+ void convert(ColumnVector vector, int row, Object value);
+ }
+
+ private static class BooleanConverter implements Converter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ ((LongColumnVector) cv).vector[row] =
+ ((BooleanWritable) value).get() ? 1 : 0;
+ }
+ }
+ }
+
+ private static class IntConverter implements Converter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ ((LongColumnVector) cv).vector[row] =
+ ((IntWritable) value).get();
+ }
+ }
+ }
+
+ private static class LongConverter implements Converter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ ((LongColumnVector) cv).vector[row] =
+ ((LongWritable) value).get();
+ }
+ }
+ }
+
+ private static class FloatConverter implements Converter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ ((DoubleColumnVector) cv).vector[row] =
+ ((FloatWritable) value).get();
+ }
+ }
+ }
+
+ private static class DoubleConverter implements Converter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ ((DoubleColumnVector) cv).vector[row] =
+ ((DoubleWritable) value).get();
+ }
+ }
+ }
+
+ private static class StringConverter implements Converter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ Text castValue = (Text) value;
+ ((BytesColumnVector) cv).setVal(row, castValue.getBytes(), 0,
+ castValue.getLength());
+ }
+ }
+ }
+
+ private static class BinaryConverter implements Converter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ BytesWritable buf = (BytesWritable) value;
+ ((BytesColumnVector) cv).setVal(row, buf.getBytes(), 0,
+ buf.getLength());
+ }
+ }
+ }
+
+ private static class TimestampConverter implements Converter {
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ TimestampColumnVector tc = (TimestampColumnVector) cv;
+ // TimestampColumnVector.time holds epoch milliseconds, not seconds.
+ tc.time[row] = ((TimestampWritable) value).getTimestamp().getTime();
+ tc.nanos[row] = ((TimestampWritable) value).getNanos();
+ }
+ }
+ }
+
+ private static class DecimalConverter implements Converter {
+ final int scale;
+ DecimalConverter(int scale) {
+ this.scale = scale;
+ }
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ DecimalColumnVector tc = (DecimalColumnVector) cv;
+ tc.vector[row].set((HiveDecimalWritable) value);
+ }
+ }
+ }
+
+ private static class ListConverter implements Converter {
+ final Converter childConverter;
+
+ ListConverter(TypeDescription schema) {
+ childConverter = createConverter(schema.getChildren().get(0));
+ }
+
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ ListColumnVector tc = (ListColumnVector) cv;
+ Writable[] array = ((ArrayWritable) value).get();
+ int start = tc.childCount;
+ int len = array.length;
+ // Record where this row's elements live in the child vector.
+ tc.offsets[row] = start;
+ tc.lengths[row] = len;
+ tc.childCount += len;
+ tc.child.ensureSize(tc.childCount, true);
+ for(int i=0; i < len; ++i) {
+ childConverter.convert(tc.child, start + i, array[i]);
+ }
+ }
+ }
+ }
+
+ private static class StructConverter implements Converter {
+ final Converter[] childConverters;
+
+ StructConverter(TypeDescription schema) {
+ List<TypeDescription> children = schema.getChildren();
+ childConverters = new Converter[children.size()];
+ for(int i=0; i < childConverters.length; ++i) {
+ childConverters[i] = createConverter(children.get(i));
+ }
+ }
+
+ public void convert(ColumnVector cv, int row, Object value) {
+ if (value == null) {
+ cv.noNulls = false;
+ cv.isNull[row] = true;
+ } else {
+ StructColumnVector tc = (StructColumnVector) cv;
+ Writable[] record = ((ArrayWritable) value).get();
+ for(int c=0; c < tc.fields.length; ++c) {
+ childConverters[c].convert(tc.fields[c], row, record[c]);
+ }
+ }
+ }
+ }
+
+ static Converter createConverter(TypeDescription types) {
+ switch (types.getCategory()) {
+ case BINARY:
+ return new BinaryConverter();
+ case BOOLEAN:
+ return new BooleanConverter();
+ case BYTE:
+ case SHORT:
+ case INT:
+ return new IntConverter();
+ case LONG:
+ return new LongConverter();
+ case FLOAT:
+ return new FloatConverter();
+ case DOUBLE:
+ return new DoubleConverter();
+ case CHAR:
+ case VARCHAR:
+ case STRING:
+ return new StringConverter();
+ case TIMESTAMP:
+ return new TimestampConverter();
+ case DECIMAL:
+ return new DecimalConverter(types.getScale());
+ case LIST:
+ return new ListConverter(types);
+ case STRUCT:
+ return new StructConverter(types);
+ default:
+ throw new IllegalArgumentException("Unhandled type " + types);
+ }
+ }
+}
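
Because the Parquet path goes through Hive's mapred input format, every value arrives wrapped in a Writable, and a row is an ArrayWritable of per-column Writables that the converters above unwrap. A tiny standalone sketch of that unwrap pattern (values invented for illustration):

    import org.apache.hadoop.io.ArrayWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.Writable;

    public class WritableUnwrap {
      public static void main(String[] args) {
        ArrayWritable row = new ArrayWritable(Writable.class,
            new Writable[]{new LongWritable(42), new Text("hello")});
        Writable[] cols = row.get();
        long id = ((LongWritable) cols[0]).get();
        String name = cols[1].toString();
        System.out.println(id + " " + name);  // 42 hello
      }
    }
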
http://git-wip-us.apache.org/repos/asf/orc/blob/48ba9241/java/bench/src/java/org/apache/orc/bench/convert/parquet/ParquetWriter.java
----------------------------------------------------------------------
diff --git a/java/bench/src/java/org/apache/orc/bench/convert/parquet/ParquetWriter.java b/java/bench/src/java/org/apache/orc/bench/convert/parquet/ParquetWriter.java
new file mode 100644
index 0000000..075060e
--- /dev/null
+++ b/java/bench/src/java/org/apache/orc/bench/convert/parquet/ParquetWriter.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.convert.parquet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.orc.OrcBenchmarkUtilities;
+import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
+import org.apache.hadoop.hive.serde2.io.ParquetHiveRecord;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.convert.BatchWriter;
+import org.apache.orc.bench.CompressionKind;
+import org.apache.orc.bench.Utilities;
+import org.apache.parquet.hadoop.ParquetOutputFormat;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+
+import java.io.IOException;
+import java.util.Properties;
+
+public class ParquetWriter implements BatchWriter {
+ private final FileSinkOperator.RecordWriter writer;
+ private final TypeDescription schema;
+ private final ParquetHiveRecord record;
+
+ public ParquetWriter(Path path,
+ TypeDescription schema,
+ Configuration conf,
+ CompressionKind compression
+ ) throws IOException {
+ JobConf jobConf = new JobConf(conf);
+ Properties tableProperties = Utilities.convertSchemaToHiveConfig(schema);
+ this.schema = schema;
+ jobConf.set(ParquetOutputFormat.COMPRESSION, getCodec(compression).name());
+ writer = new MapredParquetOutputFormat().getHiveRecordWriter(jobConf, path,
+ ParquetHiveRecord.class, compression != CompressionKind.NONE,
+ tableProperties, Reporter.NULL);
+ record = new ParquetHiveRecord(null,
+ OrcBenchmarkUtilities.createObjectInspector(schema));
+ }
+
+ public void writeBatch(VectorizedRowBatch batch) throws IOException {
+ for(int r=0; r < batch.size; ++r) {
+ record.value = OrcBenchmarkUtilities.nextObject(batch, schema, r,
+ (Writable) record.value);
+ writer.write(record);
+ }
+ }
+
+ public void close() throws IOException {
+ writer.close(false);
+ }
+
+ public static CompressionCodecName getCodec(CompressionKind kind) {
+ switch (kind) {
+ case NONE:
+ return CompressionCodecName.UNCOMPRESSED;
+ case ZLIB:
+ return CompressionCodecName.GZIP;
+ case SNAPPY:
+ return CompressionCodecName.SNAPPY;
+ default:
+ throw new IllegalArgumentException("Unsupported codec " + kind);
+ }
+ }
+}
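
The codec mapping at the bottom is the one translation layer between the benchmark's CompressionKind and Parquet's codec names; ZLIB maps to GZIP because Parquet exposes deflate through its gzip codec. A short usage sketch:

    import org.apache.orc.bench.CompressionKind;
    import org.apache.orc.bench.convert.parquet.ParquetWriter;
    import org.apache.parquet.hadoop.metadata.CompressionCodecName;

    public class CodecDemo {
      public static void main(String[] args) {
        CompressionCodecName codec = ParquetWriter.getCodec(CompressionKind.ZLIB);
        System.out.println(codec);  // GZIP
      }
    }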