You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2016/08/06 04:29:11 UTC
incubator-impala git commit: IMPALA-2878: Fix Base64Decode error and
remove duplicate codes.
Repository: incubator-impala
Updated Branches:
refs/heads/master 17bf14417 -> a46b73191
IMPALA-2878: Fix Base64Decode error and remove duplicate codes.
The original impala::Base64Decode() method did not return the original
string if it contained a trailing '\0'. For example, the string "a\0" is
encoded as "YQA=", but decoding "YQA=" with the original Base64Decode()
method returned "a", which loses the trailing '\0' of the original
string "a\0". In addition, this commit removes the code duplicated between
impala::Base64En/Decode() and impala::StringFunctions::Base64En/Decode().
Change-Id: I0170a7d180ab048d0ff2196a24ddc53626aa7aab
Reviewed-on: http://gerrit.cloudera.org:8080/3824
Reviewed-by: Yuanhao Luo <lu...@software.ict.ac.cn>
Reviewed-by: Jim Apple <jb...@cloudera.com>
Tested-by: Internal Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/a46b7319
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/a46b7319
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/a46b7319
Branch: refs/heads/master
Commit: a46b731915bbe8c71f4f0ca7aacc839edebcf6c5
Parents: 17bf144
Author: luoyuanhao <lu...@software.ict.ac.cn>
Authored: Mon Aug 1 10:04:11 2016 +0800
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Sat Aug 6 02:40:46 2016 +0000
----------------------------------------------------------------------
be/src/exec/hdfs-table-sink.cc | 2 +-
be/src/exprs/string-functions-ir.cc | 44 ++----
be/src/service/impala-http-handler.cc | 2 +-
be/src/util/CMakeLists.txt | 4 +-
be/src/util/coding-util-test.cc | 123 ++++++++++++++++
be/src/util/coding-util.cc | 217 +++++++++++++++++++++++++++++
be/src/util/coding-util.h | 78 +++++++++++
be/src/util/runtime-profile.cc | 2 +-
be/src/util/thread.cc | 2 +-
be/src/util/url-coding-test.cc | 110 ---------------
be/src/util/url-coding.cc | 192 -------------------------
be/src/util/url-coding.h | 62 ---------
be/src/util/webserver.cc | 2 +-
13 files changed, 440 insertions(+), 400 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a46b7319/be/src/exec/hdfs-table-sink.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-table-sink.cc b/be/src/exec/hdfs-table-sink.cc
index 9132638..caf7b6d 100644
--- a/be/src/exec/hdfs-table-sink.cc
+++ b/be/src/exec/hdfs-table-sink.cc
@@ -29,7 +29,7 @@
#include "runtime/string-value.inline.h"
#include "util/impalad-metrics.h"
#include "runtime/mem-tracker.h"
-#include "util/url-coding.h"
+#include "util/coding-util.h"
#include <vector>
#include <sstream>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a46b7319/be/src/exprs/string-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/string-functions-ir.cc b/be/src/exprs/string-functions-ir.cc
index f524903..1c0de9d 100644
--- a/be/src/exprs/string-functions-ir.cc
+++ b/be/src/exprs/string-functions-ir.cc
@@ -24,7 +24,7 @@
#include "exprs/expr.h"
#include "runtime/string-value.inline.h"
#include "runtime/tuple-row.h"
-#include "sasl/saslutil.h"
+#include "util/coding-util.h"
#include "util/url-parser.h"
#include "common/names.h"
@@ -802,12 +802,8 @@ StringVal StringFunctions::SplitPart(FunctionContext* context,
StringVal StringFunctions::Base64Encode(FunctionContext* ctx, const StringVal& str) {
if (str.is_null) return StringVal::null();
if (str.len == 0) return StringVal(ctx, 0);
- // Base64 encoding turns every 3 bytes into 4 characters. If the length is not divisible
- // by 3, it pads the input with extra 0 bytes until it is divisible by 3. One more
- // character must be allocated to account for sasl_encode64's null-padding of its
- // output.
- const unsigned out_max = 1 + 4 * ((static_cast<unsigned>(str.len) + 2) / 3);
- if (UNLIKELY(out_max > static_cast<unsigned>(std::numeric_limits<int>::max()))) {
+ int64_t out_max = 0;
+ if (UNLIKELY(!Base64EncodeBufLen(str.len, &out_max))) {
stringstream ss;
ss << "Could not base64 encode a string of length " << str.len;
ctx->AddWarning(ss.str().c_str());
@@ -815,10 +811,10 @@ StringVal StringFunctions::Base64Encode(FunctionContext* ctx, const StringVal& s
}
StringVal result(ctx, out_max);
if (UNLIKELY(result.is_null)) return result;
- unsigned out_len = 0;
- const int encode_result = sasl_encode64(reinterpret_cast<const char*>(str.ptr), str.len,
- reinterpret_cast<char*>(result.ptr), out_max, &out_len);
- if (UNLIKELY(encode_result != SASL_OK || out_len != out_max - 1)) {
+ int64_t out_len = 0;
+ if (UNLIKELY(!impala::Base64Encode(
+ reinterpret_cast<const char*>(str.ptr), str.len,
+ out_max, reinterpret_cast<char*>(result.ptr), &out_len))) {
stringstream ss;
ss << "Could not base64 encode input in space " << out_max
<< "; actual output length " << out_len;
@@ -832,32 +828,22 @@ StringVal StringFunctions::Base64Encode(FunctionContext* ctx, const StringVal& s
StringVal StringFunctions::Base64Decode(FunctionContext* ctx, const StringVal& str) {
if (str.is_null) return StringVal::null();
if (0 == str.len) return StringVal(ctx, 0);
- // Base64 decoding turns every 4 characters into 3 bytes. If the last character of the
- // encoded string is '=', that character (which represents 6 bits) and the last two bits
- // of the previous character is ignored, for a total of 8 ignored bits, therefore
- // producing one fewer byte of output. This is repeated if the second-to-last character
- // is '='. One more byte must be allocated to account for sasl_decode64's null-padding
- // of its output.
- if (UNLIKELY((str.len & 3) != 0)) {
+ int64_t out_max = 0;
+ if (UNLIKELY(!Base64DecodeBufLen(
+ reinterpret_cast<const char*>(str.ptr), static_cast<int64_t>(str.len),
+ &out_max))) {
stringstream ss;
ss << "Invalid base64 string; input length is " << str.len
<< ", which is not a multiple of 4.";
ctx->AddWarning(ss.str().c_str());
return StringVal::null();
}
- unsigned out_max = 1 + 3 * (str.len / 4);
- if (static_cast<char>(str.ptr[str.len - 1]) == '=') {
- --out_max;
- if (static_cast<char>(str.ptr[str.len - 2]) == '=') {
- --out_max;
- }
- }
StringVal result(ctx, out_max);
if (UNLIKELY(result.is_null)) return result;
- unsigned out_len = 0;
- const int decode_result = sasl_decode64(reinterpret_cast<const char*>(str.ptr), str.len,
- reinterpret_cast<char*>(result.ptr), out_max, &out_len);
- if (UNLIKELY(decode_result != SASL_OK || out_len != out_max - 1)) {
+ int64_t out_len = 0;
+ if (UNLIKELY(!impala::Base64Decode(
+ reinterpret_cast<const char*>(str.ptr), static_cast<int64_t>(str.len),
+ out_max, reinterpret_cast<char*>(result.ptr), &out_len))) {
stringstream ss;
ss << "Could not base64 decode input in space " << out_max
<< "; actual output length " << out_len;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a46b7319/be/src/service/impala-http-handler.cc
----------------------------------------------------------------------
diff --git a/be/src/service/impala-http-handler.cc b/be/src/service/impala-http-handler.cc
index fba756d..332a429 100644
--- a/be/src/service/impala-http-handler.cc
+++ b/be/src/service/impala-http-handler.cc
@@ -24,10 +24,10 @@
#include "service/impala-server.h"
#include "service/query-exec-state.h"
#include "thrift/protocol/TDebugProtocol.h"
+#include "util/coding-util.h"
#include "util/redactor.h"
#include "util/summary-util.h"
#include "util/time.h"
-#include "util/url-coding.h"
#include "util/webserver.h"
#include "common/names.h"
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a46b7319/be/src/util/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt
index 2bf5e49..8153eaf 100644
--- a/be/src/util/CMakeLists.txt
+++ b/be/src/util/CMakeLists.txt
@@ -34,6 +34,7 @@ add_library(Util
bit-util.cc
bloom-filter.cc
cgroups-mgr.cc
+ coding-util.cc
codec.cc
compress.cc
cpu-info.cc
@@ -79,7 +80,6 @@ add_library(Util
time.cc
tuple-row-compare.cc
url-parser.cc
- url-coding.cc
)
add_dependencies(Util thrift-deps gen_ir_descriptions)
@@ -111,7 +111,7 @@ ADD_BE_TEST(benchmark-test)
ADD_BE_TEST(decompress-test)
ADD_BE_TEST(metrics-test)
ADD_BE_TEST(debug-util-test)
-ADD_BE_TEST(url-coding-test)
+ADD_BE_TEST(coding-util-test)
ADD_BE_TEST(bit-util-test)
ADD_BE_TEST(rle-test)
ADD_BE_TEST(blocking-queue-test)
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a46b7319/be/src/util/coding-util-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/coding-util-test.cc b/be/src/util/coding-util-test.cc
new file mode 100644
index 0000000..c7b0244
--- /dev/null
+++ b/be/src/util/coding-util-test.cc
@@ -0,0 +1,123 @@
+// Copyright 2012 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <iostream>
+#include <gtest/gtest.h>
+
+#include "common/logging.h"
+#include "util/coding-util.h"
+
+#include "common/names.h"
+
+namespace impala {
+
+// Tests encoding/decoding of input. If expected_encoded is non-empty, the
+// encoded string is validated against it.
+void TestUrl(const string& input, const string& expected_encoded, bool hive_compat) {
+ string intermediate;
+ UrlEncode(input, &intermediate, hive_compat);
+ string output;
+ if (!expected_encoded.empty()) {
+ EXPECT_EQ(intermediate, expected_encoded);
+ }
+ EXPECT_TRUE(UrlDecode(intermediate, &output, hive_compat));
+ EXPECT_EQ(input, output);
+
+ // Convert string to vector and try that also
+ vector<uint8_t> input_vector;
+ input_vector.resize(input.size());
+ memcpy(&input_vector[0], input.c_str(), input.size());
+ string intermediate2;
+ UrlEncode(input_vector, &intermediate2, hive_compat);
+ EXPECT_EQ(intermediate, intermediate2);
+}
+
+void TestBase64(const string& input, const string& expected_encoded) {
+ string intermediate;
+ Base64Encode(input, &intermediate);
+ if (!expected_encoded.empty()) {
+ EXPECT_EQ(intermediate, expected_encoded);
+ }
+ int64_t out_max = 0;
+ EXPECT_TRUE(Base64DecodeBufLen(intermediate.c_str(), intermediate.size(), &out_max));
+ string output(out_max, '\0');
+ int64_t out_len = 0;
+ EXPECT_TRUE(Base64Decode(intermediate.c_str(), intermediate.size(),
+ out_max, const_cast<char*>(output.c_str()), &out_len));
+ output.resize(out_len);
+ EXPECT_EQ(input, output);
+
+ // Convert string to vector and try that also
+ vector<uint8_t> input_vector;
+ input_vector.resize(input.size());
+ memcpy(&input_vector[0], input.c_str(), input.size());
+ string intermediate2;
+ Base64Encode(input_vector, &intermediate2);
+ EXPECT_EQ(intermediate, intermediate2);
+}
+
+// Test URL encoding. Check that the values that are put in are the
+// same that come out.
+TEST(UrlCodingTest, Basic) {
+ string input = "ABCDEFGHIJKLMNOPQRSTUWXYZ1234567890~!@#$%^&*()<>?,./:\";'{}|[]\\_+-=";
+ TestUrl(input, "", false);
+ TestUrl(input, "", true);
+}
+
+TEST(UrlCodingTest, HiveExceptions) {
+ TestUrl(" +", " +", true);
+}
+
+TEST(UrlCodingTest, BlankString) {
+ TestUrl("", "", false);
+ TestUrl("", "", true);
+}
+
+TEST(UrlCodingTest, PathSeparators) {
+ TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", false);
+ TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", true);
+}
+
+TEST(Base64Test, Basic) {
+ TestBase64("a", "YQ==");
+ TestBase64("ab", "YWI=");
+ TestBase64("abc", "YWJj");
+ TestBase64("abcd", "YWJjZA==");
+ TestBase64("abcde", "YWJjZGU=");
+ TestBase64("abcdef", "YWJjZGVm");
+ TestBase64(string("a\0", 2), "YQA=");
+ TestBase64(string("ab\0", 3), "YWIA");
+ TestBase64(string("abc\0", 4), "YWJjAA==");
+ TestBase64(string("abcd\0", 5), "YWJjZAA=");
+ TestBase64(string("abcde\0", 6), "YWJjZGUA");
+ TestBase64(string("abcdef\0", 7), "YWJjZGVmAA==");
+ TestBase64(string("a\0b", 3), "YQBi");
+ TestBase64(string("a\0b\0", 4), "YQBiAA==");
+}
+
+TEST(HtmlEscapingTest, Basic) {
+ string before = "<html><body>&amp";
+ stringstream after;
+ EscapeForHtml(before, &after);
+ EXPECT_EQ(after.str(), "&lt;html&gt;&lt;body&gt;&amp;amp");
+}
+
+}
+
+int main(int argc, char **argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a46b7319/be/src/util/coding-util.cc
----------------------------------------------------------------------
diff --git a/be/src/util/coding-util.cc b/be/src/util/coding-util.cc
new file mode 100644
index 0000000..bba6fc1
--- /dev/null
+++ b/be/src/util/coding-util.cc
@@ -0,0 +1,217 @@
+// Copyright 2012 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "util/coding-util.h"
+
+#include <exception>
+#include <sstream>
+#include <boost/algorithm/string.hpp>
+
+#include "common/compiler-util.h"
+#include "common/logging.h"
+
+#include "common/names.h"
+#include "sasl/saslutil.h"
+
+using boost::algorithm::is_any_of;
+using namespace impala;
+using std::uppercase;
+
+namespace impala {
+
+// Hive selectively encodes characters. This is the whitelist of
+// characters it will encode.
+// See common/src/java/org/apache/hadoop/hive/common/FileUtils.java
+// in the Hive source code for the source of this list.
+static function<bool (char)> HiveShouldEscape = is_any_of("\"#%\\*/:=?\u00FF");
+
+// It is more convenient to maintain the complement of the set of
+// characters to escape when not in Hive-compat mode.
+static function<bool (char)> ShouldNotEscape = is_any_of("-_.~");
+
+static inline void UrlEncode(const char* in, int in_len, string* out, bool hive_compat) {
+ (*out).reserve(in_len);
+ stringstream ss;
+ for (int i = 0; i < in_len; ++i) {
+ const char ch = in[i];
+ // Escape the character iff a) we are in Hive-compat mode and the
+ // character is in the Hive whitelist or b) we are not in
+ // Hive-compat mode, and the character is not alphanumeric or one
+ // of the four commonly excluded characters.
+ if ((hive_compat && HiveShouldEscape(ch)) ||
+ (!hive_compat && !(isalnum(ch) || ShouldNotEscape(ch)))) {
+ ss << '%' << uppercase << hex << static_cast<uint32_t>(ch);
+ } else {
+ ss << ch;
+ }
+ }
+
+ (*out) = ss.str();
+}
+
+void UrlEncode(const vector<uint8_t>& in, string* out, bool hive_compat) {
+ if (in.empty()) {
+ *out = "";
+ } else {
+ UrlEncode(reinterpret_cast<const char*>(&in[0]), in.size(), out, hive_compat);
+ }
+}
+
+void UrlEncode(const string& in, string* out, bool hive_compat) {
+ UrlEncode(in.c_str(), in.size(), out, hive_compat);
+}
+
+// Adapted from
+// http://www.boost.org/doc/libs/1_40_0/doc/html/boost_asio/
+// example/http/server3/request_handler.cpp
+// See http://www.boost.org/LICENSE_1_0.txt for license for this method.
+bool UrlDecode(const string& in, string* out, bool hive_compat) {
+ out->clear();
+ out->reserve(in.size());
+ for (size_t i = 0; i < in.size(); ++i) {
+ if (in[i] == '%') {
+ if (i + 3 <= in.size()) {
+ int value = 0;
+ istringstream is(in.substr(i + 1, 2));
+ if (is >> hex >> value) {
+ (*out) += static_cast<char>(value);
+ i += 2;
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } else if (!hive_compat && in[i] == '+') { // Hive does not encode ' ' as '+'
+ (*out) += ' ';
+ } else {
+ (*out) += in[i];
+ }
+ }
+ return true;
+}
+
+bool Base64EncodeBufLen(int64_t in_len, int64_t* out_max) {
+ // Base64 encoding turns every 3 bytes into 4 characters. If the length is not
+ // divisible by 3, it pads the input with extra 0 bytes until it is divisible by 3.
+ // One more character must be allocated to account for Base64Encode's null-padding
+ // of its output.
+ *out_max = 1 + 4 * ((in_len + 2) / 3);
+ if (UNLIKELY(in_len < 0 ||
+ *out_max > static_cast<unsigned>(std::numeric_limits<int>::max()))) {
+ return false;
+ }
+ return true;
+}
+
+bool Base64Encode(const char* in, int64_t in_len, int64_t out_max, char* out,
+ int64_t* out_len) {
+ if (UNLIKELY(in_len < 0 || in_len > std::numeric_limits<unsigned>::max() ||
+ out_max < 0 || out_max > std::numeric_limits<unsigned>::max())) {
+ return false;
+ }
+ const int encode_result = sasl_encode64(in, static_cast<unsigned>(in_len), out,
+ static_cast<unsigned>(out_max), reinterpret_cast<unsigned*>(out_len));
+ if (UNLIKELY(encode_result != SASL_OK || *out_len != out_max - 1)) return false;
+ return true;
+}
+
+static inline void Base64Encode(const char* in, int64_t in_len, stringstream* out) {
+ if (in_len == 0) {
+ (*out) << "";
+ return;
+ }
+ int64_t out_max = 0;
+ if (UNLIKELY(!Base64EncodeBufLen(in_len, &out_max))) return;
+ string result(out_max, '\0');
+ int64_t out_len = 0;
+ if (UNLIKELY(!Base64Encode(in, in_len, out_max, const_cast<char*>(result.c_str()),
+ &out_len))) {
+ return;
+ }
+ result.resize(out_len);
+ (*out) << result;
+}
+
+void Base64Encode(const vector<uint8_t>& in, string* out) {
+ if (in.empty()) {
+ *out = "";
+ } else {
+ stringstream ss;
+ Base64Encode(in, &ss);
+ *out = ss.str();
+ }
+}
+
+void Base64Encode(const vector<uint8_t>& in, stringstream* out) {
+ if (!in.empty()) {
+ // Boost does not like non-null terminated strings
+ string tmp(reinterpret_cast<const char*>(&in[0]), in.size());
+ Base64Encode(tmp.c_str(), tmp.size(), out);
+ }
+}
+
+void Base64Encode(const string& in, string* out) {
+ stringstream ss;
+ Base64Encode(in.c_str(), in.size(), &ss);
+ *out = ss.str();
+}
+
+void Base64Encode(const string& in, stringstream* out) {
+ Base64Encode(in.c_str(), in.size(), out);
+}
+
+bool Base64DecodeBufLen(const char* in, int64_t in_len, int64_t* out_max) {
+ // Base64 decoding turns every 4 characters into 3 bytes. If the last character of the
+ // encoded string is '=', that character (which represents 6 bits) and the last two bits
+ // of the previous character is ignored, for a total of 8 ignored bits, therefore
+ // producing one fewer byte of output. This is repeated if the second-to-last character
+ // is '='. One more byte must be allocated to account for Base64Decode's null-padding
+ // of its output.
+ if (UNLIKELY((in_len & 3) != 0)) return false;
+ *out_max = 1 + 3 * (in_len / 4);
+ if (in[in_len - 1] == '=') {
+ --(*out_max);
+ if (in[in_len - 2] == '=') {
+ --(*out_max);
+ }
+ }
+ return true;
+}
+
+bool Base64Decode(const char* in, int64_t in_len, int64_t out_max, char* out,
+ int64_t* out_len) {
+ if (UNLIKELY((in_len & 3) != 0)) return false;
+ const int decode_result = sasl_decode64(in, static_cast<unsigned>(in_len), out,
+ static_cast<unsigned>(out_max), reinterpret_cast<unsigned*>(out_len));
+ if (UNLIKELY(decode_result != SASL_OK || *out_len != out_max - 1)) return false;
+ return true;
+}
+
+void EscapeForHtml(const string& in, stringstream* out) {
+ DCHECK(out != NULL);
+ for (const char c: in) {
+ switch (c) {
+ case '<': (*out) << "&lt;";
+ break;
+ case '>': (*out) << "&gt;";
+ break;
+ case '&': (*out) << "&amp;";
+ break;
+ default: (*out) << c;
+ }
+ }
+}
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a46b7319/be/src/util/coding-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/coding-util.h b/be/src/util/coding-util.h
new file mode 100644
index 0000000..5286953
--- /dev/null
+++ b/be/src/util/coding-util.h
@@ -0,0 +1,78 @@
+// Copyright 2012 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_CODING_UTIL_H
+#define UTIL_CODING_UTIL_H
+
+#include <string>
+#include <vector>
+#include <boost/cstdint.hpp>
+
+namespace impala {
+
+/// Utility method to URL-encode a string (that is, replace special
+/// characters with %<hex value in ascii>).
+/// The optional parameter hive_compat controls whether we mimic Hive's
+/// behaviour when encoding a string, which is only to encode certain
+/// characters (excluding, e.g., ' ')
+void UrlEncode(const std::string& in, std::string* out, bool hive_compat = false);
+void UrlEncode(const std::vector<uint8_t>& in, std::string* out,
+ bool hive_compat = false);
+
+/// Utility method to decode a string that was URL-encoded. Returns
+/// true unless the string could not be correctly decoded.
+/// The optional parameter hive_compat controls whether or not we treat
+/// the strings as encoded by Hive, which means selectively ignoring
+/// certain characters like ' '.
+bool UrlDecode(const std::string& in, std::string* out, bool hive_compat = false);
+
+/// Calculate the maximum output buffer size needed for Base64Encode. Returns false if
+/// in_len is negative or too large.
+bool Base64EncodeBufLen(int64_t in_len, int64_t* out_max);
+
+/// Returns true if encoded successfully, otherwise false. out points to the output
+/// data, the space of size out_max should be allocated before calling this function.
+/// out_len saves the actual length of encoded string.
+bool Base64Encode(const char* in, int64_t in_len, int64_t out_max, char* out,
+ int64_t* out_len);
+
+/// Utility method to encode input as base-64 encoded. This is not
+/// very performant (multiple string copies) and should not be used
+/// in a hot path.
+void Base64Encode(const std::vector<uint8_t>& in, std::string* out);
+void Base64Encode(const std::vector<uint8_t>& in, std::stringstream* out);
+void Base64Encode(const std::string& in, std::string* out);
+void Base64Encode(const std::string& in, std::stringstream* out);
+
+/// Calculate the maximum output buffer size needed for Base64Decode. Returns false if
+/// in_len is invalid.
+bool Base64DecodeBufLen(const char* in, int64_t in_len, int64_t* out_max);
+
+/// Utility method to decode a base-64 encoded string. Returns true if decoded
+/// successfully, otherwise false. out points to the output data, the space of size
+/// out_max should be allocated before calling this function. out_len saves the actual
+/// length of decoded string.
+bool Base64Decode(const char* in, int64_t in_len, int64_t out_max, char* out,
+ int64_t* out_len);
+
+/// Replaces &, < and > with &amp;, &lt; and &gt; respectively. This is
+/// not the full set of required encodings, but one that should be
+/// added to on a case-by-case basis. Slow, since it necessarily
+/// inspects each character in turn, and copies them all to *out; use
+/// judiciously.
+void EscapeForHtml(const std::string& in, std::stringstream* out);
+
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a46b7319/be/src/util/runtime-profile.cc
----------------------------------------------------------------------
diff --git a/be/src/util/runtime-profile.cc b/be/src/util/runtime-profile.cc
index ce5e934..a16967f 100644
--- a/be/src/util/runtime-profile.cc
+++ b/be/src/util/runtime-profile.cc
@@ -23,13 +23,13 @@
#include "common/object-pool.h"
#include "rpc/thrift-util.h"
+#include "util/coding-util.h"
#include "util/compress.h"
#include "util/container-util.h"
#include "util/cpu-info.h"
#include "util/debug-util.h"
#include "util/periodic-counter-updater.h"
#include "util/redactor.h"
-#include "util/url-coding.h"
#include "common/names.h"
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a46b7319/be/src/util/thread.cc
----------------------------------------------------------------------
diff --git a/be/src/util/thread.cc b/be/src/util/thread.cc
index 6325a0d..af09db5 100644
--- a/be/src/util/thread.cc
+++ b/be/src/util/thread.cc
@@ -20,12 +20,12 @@
#include <sys/syscall.h>
#include <sys/types.h>
+#include "util/coding-util.h"
#include "util/debug-util.h"
#include "util/error-util.h"
#include "util/cgroups-mgr.h"
#include "util/metrics.h"
#include "util/webserver.h"
-#include "util/url-coding.h"
#include "util/os-util.h"
#include "common/names.h"
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a46b7319/be/src/util/url-coding-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/url-coding-test.cc b/be/src/util/url-coding-test.cc
deleted file mode 100644
index 438934b..0000000
--- a/be/src/util/url-coding-test.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright 2012 Cloudera Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <iostream>
-#include <gtest/gtest.h>
-
-#include "common/logging.h"
-#include "util/url-coding.h"
-
-#include "common/names.h"
-
-namespace impala {
-
-// Tests encoding/decoding of input. If expected_encoded is non-empty, the
-// encoded string is validated against it.
-void TestUrl(const string& input, const string& expected_encoded, bool hive_compat) {
- string intermediate;
- UrlEncode(input, &intermediate, hive_compat);
- string output;
- if (!expected_encoded.empty()) {
- EXPECT_EQ(intermediate, expected_encoded);
- }
- EXPECT_TRUE(UrlDecode(intermediate, &output, hive_compat));
- EXPECT_EQ(input, output);
-
- // Convert string to vector and try that also
- vector<uint8_t> input_vector;
- input_vector.resize(input.size());
- memcpy(&input_vector[0], input.c_str(), input.size());
- string intermediate2;
- UrlEncode(input_vector, &intermediate2, hive_compat);
- EXPECT_EQ(intermediate, intermediate2);
-}
-
-void TestBase64(const string& input, const string& expected_encoded) {
- string intermediate;
- Base64Encode(input, &intermediate);
- string output;
- if (!expected_encoded.empty()) {
- EXPECT_EQ(intermediate, expected_encoded);
- }
- EXPECT_TRUE(Base64Decode(intermediate, &output));
- EXPECT_EQ(input, output);
-
- // Convert string to vector and try that also
- vector<uint8_t> input_vector;
- input_vector.resize(input.size());
- memcpy(&input_vector[0], input.c_str(), input.size());
- string intermediate2;
- Base64Encode(input_vector, &intermediate2);
- EXPECT_EQ(intermediate, intermediate2);
-}
-
-// Test URL encoding. Check that the values that are put in are the
-// same that come out.
-TEST(UrlCodingTest, Basic) {
- string input = "ABCDEFGHIJKLMNOPQRSTUWXYZ1234567890~!@#$%^&*()<>?,./:\";'{}|[]\\_+-=";
- TestUrl(input, "", false);
- TestUrl(input, "", true);
-}
-
-TEST(UrlCodingTest, HiveExceptions) {
- TestUrl(" +", " +", true);
-}
-
-TEST(UrlCodingTest, BlankString) {
- TestUrl("", "", false);
- TestUrl("", "", true);
-}
-
-TEST(UrlCodingTest, PathSeparators) {
- TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", false);
- TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", true);
-}
-
-TEST(Base64Test, Basic) {
- TestBase64("a", "YQ==");
- TestBase64("ab", "YWI=");
- TestBase64("abc", "YWJj");
- TestBase64("abcd", "YWJjZA==");
- TestBase64("abcde", "YWJjZGU=");
- TestBase64("abcdef", "YWJjZGVm");
-}
-
-TEST(HtmlEscapingTest, Basic) {
- string before = "<html><body>&amp";
- stringstream after;
- EscapeForHtml(before, &after);
- EXPECT_EQ(after.str(), "&lt;html&gt;&lt;body&gt;&amp;amp");
-}
-
-}
-
-int main(int argc, char **argv) {
- ::testing::InitGoogleTest(&argc, argv);
- return RUN_ALL_TESTS();
-}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a46b7319/be/src/util/url-coding.cc
----------------------------------------------------------------------
diff --git a/be/src/util/url-coding.cc b/be/src/util/url-coding.cc
deleted file mode 100644
index 20e5783..0000000
--- a/be/src/util/url-coding.cc
+++ /dev/null
@@ -1,192 +0,0 @@
-// Copyright 2012 Cloudera Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/url-coding.h"
-
-#include <exception>
-#include <sstream>
-#include <boost/algorithm/string.hpp>
-#include <boost/archive/iterators/base64_from_binary.hpp>
-#include <boost/archive/iterators/binary_from_base64.hpp>
-#include <boost/archive/iterators/transform_width.hpp>
-
-#include "common/logging.h"
-
-#include "common/names.h"
-
-using boost::algorithm::is_any_of;
-using boost::archive::iterators::base64_from_binary;
-using boost::archive::iterators::binary_from_base64;
-using boost::archive::iterators::transform_width;
-using namespace impala;
-using std::uppercase;
-
-namespace impala {
-
-// Hive selectively encodes characters. This is the whitelist of
-// characters it will encode.
-// See common/src/java/org/apache/hadoop/hive/common/FileUtils.java
-// in the Hive source code for the source of this list.
-static function<bool (char)> HiveShouldEscape = is_any_of("\"#%\\*/:=?\u00FF");
-
-// It is more convenient to maintain the complement of the set of
-// characters to escape when not in Hive-compat mode.
-static function<bool (char)> ShouldNotEscape = is_any_of("-_.~");
-
-static inline void UrlEncode(const char* in, int in_len, string* out, bool hive_compat) {
- (*out).reserve(in_len);
- stringstream ss;
- for (int i = 0; i < in_len; ++i) {
- const char ch = in[i];
- // Escape the character iff a) we are in Hive-compat mode and the
- // character is in the Hive whitelist or b) we are not in
- // Hive-compat mode, and the character is not alphanumeric or one
- // of the four commonly excluded characters.
- if ((hive_compat && HiveShouldEscape(ch)) ||
- (!hive_compat && !(isalnum(ch) || ShouldNotEscape(ch)))) {
- ss << '%' << uppercase << hex << static_cast<uint32_t>(ch);
- } else {
- ss << ch;
- }
- }
-
- (*out) = ss.str();
-}
-
-void UrlEncode(const vector<uint8_t>& in, string* out, bool hive_compat) {
- if (in.empty()) {
- *out = "";
- } else {
- UrlEncode(reinterpret_cast<const char*>(&in[0]), in.size(), out, hive_compat);
- }
-}
-
-void UrlEncode(const string& in, string* out, bool hive_compat) {
- UrlEncode(in.c_str(), in.size(), out, hive_compat);
-}
-
-// Adapted from
-// http://www.boost.org/doc/libs/1_40_0/doc/html/boost_asio/
-// example/http/server3/request_handler.cpp
-// See http://www.boost.org/LICENSE_1_0.txt for license for this method.
-bool UrlDecode(const string& in, string* out, bool hive_compat) {
- out->clear();
- out->reserve(in.size());
- for (size_t i = 0; i < in.size(); ++i) {
- if (in[i] == '%') {
- if (i + 3 <= in.size()) {
- int value = 0;
- istringstream is(in.substr(i + 1, 2));
- if (is >> hex >> value) {
- (*out) += static_cast<char>(value);
- i += 2;
- } else {
- return false;
- }
- } else {
- return false;
- }
- } else if (!hive_compat && in[i] == '+') { // Hive does not encode ' ' as '+'
- (*out) += ' ';
- } else {
- (*out) += in[i];
- }
- }
- return true;
-}
-
-static inline void Base64Encode(const char* in, int in_len, stringstream* out) {
- typedef base64_from_binary<transform_width<const char*, 6, 8>> base64_encode;
- // Base64 encodes 8 byte chars as 6 bit values.
- stringstream::pos_type len_before = out->tellp();
- copy(base64_encode(in), base64_encode(in + in_len), std::ostream_iterator<char>(*out));
- int bytes_written = out->tellp() - len_before;
- // Pad with = to make it valid base64 encoded string
- int num_pad = bytes_written % 4;
- if (num_pad != 0) {
- num_pad = 4 - num_pad;
- for (int i = 0; i < num_pad; ++i) {
- (*out) << "=";
- }
- }
- DCHECK_EQ((out->tellp() - len_before) % 4, 0);
-}
-
-void Base64Encode(const vector<uint8_t>& in, string* out) {
- if (in.empty()) {
- *out = "";
- } else {
- stringstream ss;
- Base64Encode(in, &ss);
- *out = ss.str();
- }
-}
-
-void Base64Encode(const vector<uint8_t>& in, stringstream* out) {
- if (!in.empty()) {
- // Boost does not like non-null terminated strings
- string tmp(reinterpret_cast<const char*>(&in[0]), in.size());
- Base64Encode(tmp.c_str(), tmp.size(), out);
- }
-}
-
-void Base64Encode(const string& in, string* out) {
- stringstream ss;
- Base64Encode(in.c_str(), in.size(), &ss);
- *out = ss.str();
-}
-
-void Base64Encode(const string& in, stringstream* out) {
- Base64Encode(in.c_str(), in.size(), out);
-}
-
-bool Base64Decode(const string& in, string* out) {
- typedef transform_width<
- binary_from_base64<string::const_iterator> , 8, 6> base64_decode;
- string tmp = in;
- // Replace padding with base64 encoded NULL
- replace(tmp.begin(), tmp.end(), '=', 'A');
- try {
- *out = string(base64_decode(tmp.begin()), base64_decode(tmp.end()));
- } catch(std::exception& e) {
- return false;
- }
-
- // Remove trailing '\0' that were added as padding. Since \0 is special,
- // the boost functions get confused so do this manually.
- int num_padded_chars = 0;
- for (int i = out->size() - 1; i >= 0; --i) {
- if ((*out)[i] != '\0') break;
- ++num_padded_chars;
- }
- out->resize(out->size() - num_padded_chars);
- return true;
-}
-
-void EscapeForHtml(const string& in, stringstream* out) {
- DCHECK(out != NULL);
- for (const char c: in) {
- switch (c) {
- case '<': (*out) << "&lt;";
- break;
- case '>': (*out) << "&gt;";
- break;
- case '&': (*out) << "&amp;";
- break;
- default: (*out) << c;
- }
- }
-}
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a46b7319/be/src/util/url-coding.h
----------------------------------------------------------------------
diff --git a/be/src/util/url-coding.h b/be/src/util/url-coding.h
deleted file mode 100644
index 1ac5d5e..0000000
--- a/be/src/util/url-coding.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2012 Cloudera Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTIL_URL_CODING_H
-#define UTIL_URL_CODING_H
-
-#include <string>
-#include <vector>
-#include <boost/cstdint.hpp>
-
-namespace impala {
-
-/// Utility method to URL-encode a string (that is, replace special
-/// characters with %<hex value in ascii>).
-/// The optional parameter hive_compat controls whether we mimic Hive's
-/// behaviour when encoding a string, which is only to encode certain
-/// characters (excluding, e.g., ' ')
-void UrlEncode(const std::string& in, std::string* out, bool hive_compat = false);
-void UrlEncode(const std::vector<uint8_t>& in, std::string* out,
- bool hive_compat = false);
-
-/// Utility method to decode a string that was URL-encoded. Returns
-/// true unless the string could not be correctly decoded.
-/// The optional parameter hive_compat controls whether or not we treat
-/// the strings as encoded by Hive, which means selectively ignoring
-/// certain characters like ' '.
-bool UrlDecode(const std::string& in, std::string* out, bool hive_compat = false);
-
-/// Utility method to encode input as base-64 encoded. This is not
-/// very performant (multiple string copies) and should not be used
-/// in a hot path.
-void Base64Encode(const std::vector<uint8_t>& in, std::string* out);
-void Base64Encode(const std::vector<uint8_t>& in, std::stringstream* out);
-void Base64Encode(const std::string& in, std::string* out);
-void Base64Encode(const std::string& in, std::stringstream* out);
-
-/// Utility method to decode base64 encoded strings. Also not extremely
-/// performant.
-/// Returns true unless the string could not be correctly decoded.
-bool Base64Decode(const std::string& in, std::string* out);
-
-/// Replaces &, < and > with &amp;, &lt; and &gt; respectively. This is
-/// not the full set of required encodings, but one that should be
-/// added to on a case-by-case basis. Slow, since it necessarily
-/// inspects each character in turn, and copies them all to *out; use
-/// judiciously.
-void EscapeForHtml(const std::string& in, std::stringstream* out);
-
-}
-
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a46b7319/be/src/util/webserver.cc
----------------------------------------------------------------------
diff --git a/be/src/util/webserver.cc b/be/src/util/webserver.cc
index b514c36..8069e01 100644
--- a/be/src/util/webserver.cc
+++ b/be/src/util/webserver.cc
@@ -33,13 +33,13 @@
#include "rpc/thrift-util.h"
#include "thirdparty/mustache/mustache.h"
#include "util/asan.h"
+#include "util/coding-util.h"
#include "util/cpu-info.h"
#include "util/disk-info.h"
#include "util/mem-info.h"
#include "util/os-info.h"
#include "util/os-util.h"
#include "util/process-state-info.h"
-#include "util/url-coding.h"
#include "util/debug-util.h"
#include "util/pretty-printer.h"
#include "util/stopwatch.h"