You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/01/30 01:57:18 UTC

[GitHub] [arrow] kou opened a new pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

kou opened a new pull request #9367:
URL: https://github.com/apache/arrow/pull/9367


   std::regex provided by MinGW may take a long with Japanese location on
   Windows.
   
   We can use std::regex, boost::regex or RE2 as regular expression
   engine for this but RE2 doesn't use compatible syntax with others. If
   we support all of them, we need to maintain multiple regular
   expressions. It increases maintenance cost. If we don't use regular
   expression, we don't need to think about regular expression. But we
   need to maintain hand-written parser.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#discussion_r568468397



##########
File path: cpp/src/parquet/metadata_test.cc
##########
@@ -342,5 +342,123 @@ TEST(ApplicationVersion, Basics) {
       version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_int, SortOrder::UNSIGNED));
 }
 
+TEST(ApplicationVersion, Empty) {
+  ApplicationVersion version("");
+
+  ASSERT_EQ("", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionEmpty) {
+  ApplicationVersion version("parquet-mr version ");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoMajor) {
+  ApplicationVersion version("parquet-mr version .");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoMinor) {
+  ApplicationVersion version("parquet-mr version 1.");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoPatch) {
+  ApplicationVersion version("parquet-mr version 1.7.");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoUnknown) {
+  ApplicationVersion version("parquet-mr version 1.7.9-cdh5.5.0+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("cdh5.5.0", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoPreRelease) {
+  ApplicationVersion version("parquet-mr version 1.7.9ab+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("ab", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoUnknownNoPreRelease) {
+  ApplicationVersion version("parquet-mr version 1.7.9+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, FullWithSpaces) {
+  ApplicationVersion version(
+      " parquet-mr  version  1.5.3ab-cdh5.5.0+cd  (build  abcd ) ");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("abcd", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(5, version.version.minor);
+  ASSERT_EQ(3, version.version.patch);
+  ASSERT_EQ("ab", version.version.unknown);
+  ASSERT_EQ("cdh5.5.0", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+

Review comment:
       I think assuming ASCII is ok.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] kou commented on a change in pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
kou commented on a change in pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#discussion_r567323043



##########
File path: cpp/src/parquet/metadata.cc
##########
@@ -949,43 +910,278 @@ ApplicationVersion::ApplicationVersion(std::string application, int major, int m
                                        int patch)
     : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
 
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
-  // Use singletons to compile only once (ARROW-9863)
-  static regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
-  static regex ver_regex{ApplicationVersion::VERSION_FORMAT};
-  smatch app_matches;
-  smatch ver_matches;
-
-  std::string created_by_lower = created_by;
-  std::transform(created_by_lower.begin(), created_by_lower.end(),
-                 created_by_lower.begin(), ::tolower);
-
-  bool app_success = regex_match(created_by_lower, app_matches, app_regex);
-  bool ver_success = false;
-  std::string version_str;
-
-  if (app_success && app_matches.size() >= 4) {
-    // first match is the entire string. sub-matches start from second.
-    application_ = app_matches[1];
-    version_str = app_matches[3];
-    build_ = app_matches[4];
-    ver_success = regex_match(version_str, ver_matches, ver_regex);
-  } else {
-    application_ = "unknown";
-  }
-
-  if (ver_success && ver_matches.size() >= 7) {
-    version.major = atoi(ver_matches[1].str().c_str());
-    version.minor = atoi(ver_matches[2].str().c_str());
-    version.patch = atoi(ver_matches[3].str().c_str());
-    version.unknown = ver_matches[4].str();
-    version.pre_release = ver_matches[5].str();
-    version.build_info = ver_matches[6].str();
-  } else {
-    version.major = 0;
-    version.minor = 0;
-    version.patch = 0;
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format:
+//   "${APPLICATION_NAME}"
+//   "${APPLICATION_NAME} version ${VERSION}"
+//   "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+//   parquet-cpp
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+//   "${MAJOR}.${MINOR}.${PATCH}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//

Review comment:
       Thanks.
   I didn't think about parquet-mr's implementation. I've added some links.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#issuecomment-771538267


   Travis-CI build: https://travis-ci.com/github/pitrou/arrow/builds/215736951


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] kou commented on a change in pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
kou commented on a change in pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#discussion_r568335632



##########
File path: cpp/src/parquet/metadata.cc
##########
@@ -949,43 +910,283 @@ ApplicationVersion::ApplicationVersion(std::string application, int major, int m
                                        int patch)
     : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
 
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
-  // Use singletons to compile only once (ARROW-9863)
-  static regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
-  static regex ver_regex{ApplicationVersion::VERSION_FORMAT};
-  smatch app_matches;
-  smatch ver_matches;
-
-  std::string created_by_lower = created_by;
-  std::transform(created_by_lower.begin(), created_by_lower.end(),
-                 created_by_lower.begin(), ::tolower);
-
-  bool app_success = regex_match(created_by_lower, app_matches, app_regex);
-  bool ver_success = false;
-  std::string version_str;
-
-  if (app_success && app_matches.size() >= 4) {
-    // first match is the entire string. sub-matches start from second.
-    application_ = app_matches[1];
-    version_str = app_matches[3];
-    build_ = app_matches[4];
-    ver_success = regex_match(version_str, ver_matches, ver_regex);
-  } else {
-    application_ = "unknown";
-  }
-
-  if (ver_success && ver_matches.size() >= 7) {
-    version.major = atoi(ver_matches[1].str().c_str());
-    version.minor = atoi(ver_matches[2].str().c_str());
-    version.patch = atoi(ver_matches[3].str().c_str());
-    version.unknown = ver_matches[4].str();
-    version.pre_release = ver_matches[5].str();
-    version.build_info = ver_matches[6].str();
-  } else {
-    version.major = 0;
-    version.minor = 0;
-    version.patch = 0;
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format must be compatible parquet-mr's
+// one. See also:
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
+//
+// The application version format:
+//   "${APPLICATION_NAME}"
+//   "${APPLICATION_NAME} version ${VERSION}"
+//   "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+//   parquet-cpp
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+//   "${MAJOR}.${MINOR}.${PATCH}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//
+// Eg:
+//   1.5.0
+//   1.5.0ab
+//   1.5.0ab-cdh5.5.0
+//   1.5.0ab-cdh5.5.0+cd
+//   1.5.0ab+cd
+//   1.5.0-cdh5.5.0
+//   1.5.0-cdh5.5.0+cd
+//   1.5.0+cd
+class ApplicationVersionParser {
+ public:
+  ApplicationVersionParser(const std::string& created_by,
+                           ApplicationVersion& application_version)
+      : created_by_(created_by),
+        application_version_(application_version),
+        digit_("0123456789") {}
+
+  void Parse() {
+    application_version_.application_ = "unknown";
+    application_version_.version = {0, 0, 0, "", "", ""};
+
+    if (!ParseApplicationName()) {
+      return;
+    }
+    if (!ParseVersion()) {
+      return;
+    }
+    if (!ParseBuildName()) {
+      return;
+    }
+  }
+
+ private:
+  void RemovePrecedingSpaces(const std::string& string, size_t& start,
+                             const size_t& end) {
+    while (start < end && string[start] == ' ') {

Review comment:
       I've added `\t`, `\v`, `\r`, `\n` and `\f` to supported whitespace characters but they will not be appeared as you said.

##########
File path: cpp/src/parquet/metadata.cc
##########
@@ -949,43 +910,283 @@ ApplicationVersion::ApplicationVersion(std::string application, int major, int m
                                        int patch)
     : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
 
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
-  // Use singletons to compile only once (ARROW-9863)
-  static regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
-  static regex ver_regex{ApplicationVersion::VERSION_FORMAT};
-  smatch app_matches;
-  smatch ver_matches;
-
-  std::string created_by_lower = created_by;
-  std::transform(created_by_lower.begin(), created_by_lower.end(),
-                 created_by_lower.begin(), ::tolower);
-
-  bool app_success = regex_match(created_by_lower, app_matches, app_regex);
-  bool ver_success = false;
-  std::string version_str;
-
-  if (app_success && app_matches.size() >= 4) {
-    // first match is the entire string. sub-matches start from second.
-    application_ = app_matches[1];
-    version_str = app_matches[3];
-    build_ = app_matches[4];
-    ver_success = regex_match(version_str, ver_matches, ver_regex);
-  } else {
-    application_ = "unknown";
-  }
-
-  if (ver_success && ver_matches.size() >= 7) {
-    version.major = atoi(ver_matches[1].str().c_str());
-    version.minor = atoi(ver_matches[2].str().c_str());
-    version.patch = atoi(ver_matches[3].str().c_str());
-    version.unknown = ver_matches[4].str();
-    version.pre_release = ver_matches[5].str();
-    version.build_info = ver_matches[6].str();
-  } else {
-    version.major = 0;
-    version.minor = 0;
-    version.patch = 0;
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format must be compatible parquet-mr's
+// one. See also:
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
+//
+// The application version format:
+//   "${APPLICATION_NAME}"
+//   "${APPLICATION_NAME} version ${VERSION}"
+//   "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+//   parquet-cpp
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+//   "${MAJOR}.${MINOR}.${PATCH}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//
+// Eg:
+//   1.5.0
+//   1.5.0ab
+//   1.5.0ab-cdh5.5.0
+//   1.5.0ab-cdh5.5.0+cd
+//   1.5.0ab+cd
+//   1.5.0-cdh5.5.0
+//   1.5.0-cdh5.5.0+cd
+//   1.5.0+cd
+class ApplicationVersionParser {
+ public:
+  ApplicationVersionParser(const std::string& created_by,
+                           ApplicationVersion& application_version)
+      : created_by_(created_by),
+        application_version_(application_version),
+        digit_("0123456789") {}
+
+  void Parse() {
+    application_version_.application_ = "unknown";
+    application_version_.version = {0, 0, 0, "", "", ""};
+
+    if (!ParseApplicationName()) {
+      return;
+    }
+    if (!ParseVersion()) {
+      return;
+    }
+    if (!ParseBuildName()) {
+      return;
+    }
+  }
+
+ private:
+  void RemovePrecedingSpaces(const std::string& string, size_t& start,
+                             const size_t& end) {
+    while (start < end && string[start] == ' ') {
+      ++start;
+    }
+  }
+
+  void RemoveTrailingSpaces(const std::string& string, const size_t& start, size_t& end) {
+    while (start < (end - 1) && (end - 1) < string.size() && string[end - 1] == ' ') {
+      --end;
+    }
   }
+
+  bool ParseApplicationName() {
+    std::string version_mark(" version ");
+    auto version_mark_position = created_by_.find(version_mark);
+    size_t application_name_end;
+    // No VERSION and BUILD_NAME.
+    if (version_mark_position == std::string::npos) {
+      version_start_ = std::string::npos;
+      application_name_end = created_by_.size();
+    } else {
+      version_start_ = version_mark_position + version_mark.size();
+      application_name_end = version_mark_position;
+    }
+
+    size_t application_name_start = 0;
+    RemovePrecedingSpaces(created_by_, application_name_start, application_name_end);
+    RemoveTrailingSpaces(created_by_, application_name_start, application_name_end);
+    application_version_.application_ = created_by_.substr(
+        application_name_start, application_name_end - application_name_start);
+
+    return true;
+  }
+
+  bool ParseVersion() {
+    // No VERSION.
+    if (version_start_ == std::string::npos) {
+      return true;
+    }
+
+    RemovePrecedingSpaces(created_by_, version_start_, created_by_.size());
+    version_end_ = created_by_.find(" (", version_start_);
+    // No BUILD_NAME.
+    if (version_end_ == std::string::npos) {
+      version_end_ = created_by_.size();
+    }
+    RemoveTrailingSpaces(created_by_, version_start_, version_end_);
+    // No VERSION.
+    if (version_start_ == version_end_) {
+      return false;
+    }
+    version_string_ = created_by_.substr(version_start_, version_end_ - version_start_);
+
+    if (!ParseVersionMajor()) {
+      return false;
+    }
+    if (!ParseVersionMinor()) {
+      return false;
+    }
+    if (!ParseVersionPatch()) {
+      return false;
+    }
+    if (!ParseVersionUnknown()) {
+      return false;
+    }
+    if (!ParseVersionPreRelease()) {
+      return false;
+    }
+    if (!ParseVersionBuildInfo()) {
+      return false;
+    }
+
+    return true;
+  }
+
+  bool ParseVersionMajor() {
+    size_t version_major_start = 0;
+    auto version_major_end = version_string_.find_first_not_of(digit_);
+    // No ".".
+    if (version_major_end == std::string::npos ||

Review comment:
       We can support `${MAJOR}` only case but it's not accepted in the original implementation. So I rejected the case.
   
   But I've added support for `${MAJOR}` because we can be liberal for input.

##########
File path: cpp/src/parquet/metadata.h
##########
@@ -58,16 +58,6 @@ class PARQUET_EXPORT ApplicationVersion {
   static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
   static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
   static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
-  // Regular expression for the version format
-  // major . minor . patch unknown - prerelease.x + build info
-  // Eg: 1.5.0ab-cdh5.5.0+cd
-  static constexpr char const* VERSION_FORMAT =
-      "^(\\d+)\\.(\\d+)\\.(\\d+)([^-+]*)?(?:-([^+]*))?(?:\\+(.*))?$";
-  // Regular expression for the application format
-  // application_name version VERSION_FORMAT (build build_name)
-  // Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
-  static constexpr char const* APPLICATION_FORMAT =
-      "(.*?)\\s*(?:(version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?)?)";
 

Review comment:
       Good catch!
   I've removed.

##########
File path: cpp/src/parquet/metadata_test.cc
##########
@@ -342,5 +342,123 @@ TEST(ApplicationVersion, Basics) {
       version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_int, SortOrder::UNSIGNED));
 }
 
+TEST(ApplicationVersion, Empty) {
+  ApplicationVersion version("");
+
+  ASSERT_EQ("", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionEmpty) {
+  ApplicationVersion version("parquet-mr version ");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoMajor) {
+  ApplicationVersion version("parquet-mr version .");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoMinor) {
+  ApplicationVersion version("parquet-mr version 1.");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoPatch) {
+  ApplicationVersion version("parquet-mr version 1.7.");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoUnknown) {
+  ApplicationVersion version("parquet-mr version 1.7.9-cdh5.5.0+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("cdh5.5.0", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoPreRelease) {
+  ApplicationVersion version("parquet-mr version 1.7.9ab+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("ab", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoUnknownNoPreRelease) {
+  ApplicationVersion version("parquet-mr version 1.7.9+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, FullWithSpaces) {
+  ApplicationVersion version(
+      " parquet-mr  version  1.5.3ab-cdh5.5.0+cd  (build  abcd ) ");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("abcd", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(5, version.version.minor);
+  ASSERT_EQ(3, version.version.patch);
+  ASSERT_EQ("ab", version.version.unknown);
+  ASSERT_EQ("cdh5.5.0", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+

Review comment:
       I've add some more tests.
   
   Note that this implementation assumes that input encoding is ASCII. It may not work with other encodings.
   Should we support non ASCII encodings?

##########
File path: cpp/src/parquet/metadata.cc
##########
@@ -949,43 +910,283 @@ ApplicationVersion::ApplicationVersion(std::string application, int major, int m
                                        int patch)
     : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
 
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
-  // Use singletons to compile only once (ARROW-9863)
-  static regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
-  static regex ver_regex{ApplicationVersion::VERSION_FORMAT};
-  smatch app_matches;
-  smatch ver_matches;
-
-  std::string created_by_lower = created_by;
-  std::transform(created_by_lower.begin(), created_by_lower.end(),
-                 created_by_lower.begin(), ::tolower);
-
-  bool app_success = regex_match(created_by_lower, app_matches, app_regex);
-  bool ver_success = false;
-  std::string version_str;
-
-  if (app_success && app_matches.size() >= 4) {
-    // first match is the entire string. sub-matches start from second.
-    application_ = app_matches[1];
-    version_str = app_matches[3];
-    build_ = app_matches[4];
-    ver_success = regex_match(version_str, ver_matches, ver_regex);
-  } else {
-    application_ = "unknown";
-  }
-
-  if (ver_success && ver_matches.size() >= 7) {
-    version.major = atoi(ver_matches[1].str().c_str());
-    version.minor = atoi(ver_matches[2].str().c_str());
-    version.patch = atoi(ver_matches[3].str().c_str());
-    version.unknown = ver_matches[4].str();
-    version.pre_release = ver_matches[5].str();
-    version.build_info = ver_matches[6].str();
-  } else {
-    version.major = 0;
-    version.minor = 0;
-    version.patch = 0;
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format must be compatible parquet-mr's
+// one. See also:
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
+//
+// The application version format:
+//   "${APPLICATION_NAME}"
+//   "${APPLICATION_NAME} version ${VERSION}"
+//   "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+//   parquet-cpp
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+//   "${MAJOR}.${MINOR}.${PATCH}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//
+// Eg:
+//   1.5.0
+//   1.5.0ab
+//   1.5.0ab-cdh5.5.0
+//   1.5.0ab-cdh5.5.0+cd
+//   1.5.0ab+cd
+//   1.5.0-cdh5.5.0
+//   1.5.0-cdh5.5.0+cd
+//   1.5.0+cd
+class ApplicationVersionParser {
+ public:
+  ApplicationVersionParser(const std::string& created_by,
+                           ApplicationVersion& application_version)
+      : created_by_(created_by),
+        application_version_(application_version),
+        digit_("0123456789") {}
+
+  void Parse() {
+    application_version_.application_ = "unknown";
+    application_version_.version = {0, 0, 0, "", "", ""};
+
+    if (!ParseApplicationName()) {
+      return;
+    }
+    if (!ParseVersion()) {
+      return;
+    }
+    if (!ParseBuildName()) {
+      return;
+    }
+  }
+
+ private:
+  void RemovePrecedingSpaces(const std::string& string, size_t& start,
+                             const size_t& end) {
+    while (start < end && string[start] == ' ') {
+      ++start;
+    }
+  }
+
+  void RemoveTrailingSpaces(const std::string& string, const size_t& start, size_t& end) {
+    while (start < (end - 1) && (end - 1) < string.size() && string[end - 1] == ' ') {
+      --end;
+    }
   }
+
+  bool ParseApplicationName() {
+    std::string version_mark(" version ");
+    auto version_mark_position = created_by_.find(version_mark);
+    size_t application_name_end;
+    // No VERSION and BUILD_NAME.
+    if (version_mark_position == std::string::npos) {
+      version_start_ = std::string::npos;
+      application_name_end = created_by_.size();
+    } else {
+      version_start_ = version_mark_position + version_mark.size();
+      application_name_end = version_mark_position;
+    }
+
+    size_t application_name_start = 0;
+    RemovePrecedingSpaces(created_by_, application_name_start, application_name_end);
+    RemoveTrailingSpaces(created_by_, application_name_start, application_name_end);
+    application_version_.application_ = created_by_.substr(
+        application_name_start, application_name_end - application_name_start);
+
+    return true;
+  }
+
+  bool ParseVersion() {
+    // No VERSION.
+    if (version_start_ == std::string::npos) {
+      return true;
+    }
+
+    RemovePrecedingSpaces(created_by_, version_start_, created_by_.size());
+    version_end_ = created_by_.find(" (", version_start_);
+    // No BUILD_NAME.
+    if (version_end_ == std::string::npos) {
+      version_end_ = created_by_.size();
+    }
+    RemoveTrailingSpaces(created_by_, version_start_, version_end_);
+    // No VERSION.
+    if (version_start_ == version_end_) {
+      return false;
+    }
+    version_string_ = created_by_.substr(version_start_, version_end_ - version_start_);
+
+    if (!ParseVersionMajor()) {
+      return false;
+    }
+    if (!ParseVersionMinor()) {
+      return false;
+    }
+    if (!ParseVersionPatch()) {
+      return false;
+    }
+    if (!ParseVersionUnknown()) {
+      return false;
+    }
+    if (!ParseVersionPreRelease()) {
+      return false;
+    }
+    if (!ParseVersionBuildInfo()) {
+      return false;
+    }
+
+    return true;
+  }
+
+  bool ParseVersionMajor() {
+    size_t version_major_start = 0;
+    auto version_major_end = version_string_.find_first_not_of(digit_);
+    // No ".".
+    if (version_major_end == std::string::npos ||
+        version_string_[version_major_end] != '.') {
+      return false;
+    }
+    // No MAJOR.
+    if (version_major_end == version_major_start) {
+      return false;
+    }
+    auto version_major_string = version_string_.substr(
+        version_major_start, version_major_end - version_major_start);
+    application_version_.version.major = atoi(version_major_string.c_str());
+    version_parsing_position_ = version_major_end + 1;  // +1 is for '.'.
+    return true;
+  }
+
+  bool ParseVersionMinor() {
+    auto version_minor_start = version_parsing_position_;
+    auto version_minor_end =
+        version_string_.find_first_not_of(digit_, version_minor_start);
+    if (version_minor_end == std::string::npos ||

Review comment:
       Ditto.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou closed pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
pitrou closed pull request #9367:
URL: https://github.com/apache/arrow/pull/9367


   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] kou commented on a change in pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
kou commented on a change in pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#discussion_r567323181



##########
File path: cpp/cmake_modules/ThirdpartyToolchain.cmake
##########
@@ -854,13 +854,6 @@ else()
   set(THRIFT_REQUIRES_BOOST FALSE)
 endif()
 
-# Parquet requires boost only with gcc 4.8 (because of missing std::regex).
-if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9")
-  set(PARQUET_REQUIRES_BOOST TRUE)
-else()
-  set(PARQUET_REQUIRES_BOOST FALSE)
-endif()
-

Review comment:
       Thanks.
   I didn't notice that we need boost::regex only for this feature. I've removed boost::regex dependency entirely.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou closed pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
pitrou closed pull request #9367:
URL: https://github.com/apache/arrow/pull/9367


   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#issuecomment-771538267


   Travis-CI build: https://travis-ci.com/github/pitrou/arrow/builds/215736951


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] kou commented on pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
kou commented on pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#issuecomment-770286711


   @github-actions crossbow submit -g nightly


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] kou commented on pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
kou commented on pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#issuecomment-771944195


   Thanks for the boolean fix!


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] kou commented on pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
kou commented on pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#issuecomment-771944195


   Thanks for the boolean fix!


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#discussion_r568468397



##########
File path: cpp/src/parquet/metadata_test.cc
##########
@@ -342,5 +342,123 @@ TEST(ApplicationVersion, Basics) {
       version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_int, SortOrder::UNSIGNED));
 }
 
+TEST(ApplicationVersion, Empty) {
+  ApplicationVersion version("");
+
+  ASSERT_EQ("", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionEmpty) {
+  ApplicationVersion version("parquet-mr version ");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoMajor) {
+  ApplicationVersion version("parquet-mr version .");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoMinor) {
+  ApplicationVersion version("parquet-mr version 1.");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoPatch) {
+  ApplicationVersion version("parquet-mr version 1.7.");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoUnknown) {
+  ApplicationVersion version("parquet-mr version 1.7.9-cdh5.5.0+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("cdh5.5.0", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoPreRelease) {
+  ApplicationVersion version("parquet-mr version 1.7.9ab+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("ab", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoUnknownNoPreRelease) {
+  ApplicationVersion version("parquet-mr version 1.7.9+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, FullWithSpaces) {
+  ApplicationVersion version(
+      " parquet-mr  version  1.5.3ab-cdh5.5.0+cd  (build  abcd ) ");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("abcd", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(5, version.version.minor);
+  ASSERT_EQ(3, version.version.patch);
+  ASSERT_EQ("ab", version.version.unknown);
+  ASSERT_EQ("cdh5.5.0", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+

Review comment:
       I think assuming ASCII is ok.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nealrichardson commented on a change in pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
nealrichardson commented on a change in pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#discussion_r567267561



##########
File path: cpp/cmake_modules/ThirdpartyToolchain.cmake
##########
@@ -854,13 +854,6 @@ else()
   set(THRIFT_REQUIRES_BOOST FALSE)
 endif()
 
-# Parquet requires boost only with gcc 4.8 (because of missing std::regex).
-if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9")
-  set(PARQUET_REQUIRES_BOOST TRUE)
-else()
-  set(PARQUET_REQUIRES_BOOST FALSE)
-endif()
-

Review comment:
       Should also remove the lines with `BOOST_REGEX_LIBRARY` below, right? Nothing else uses boost::regex in our code. 
   
   We can also remove regex.hpp from cpp/build-support/trim-boost.sh. That should probably be a followup JIRA, but could you edit the comment on L38 of that script to say that we actually don't need it anymore?

##########
File path: cpp/src/parquet/metadata.cc
##########
@@ -949,43 +910,278 @@ ApplicationVersion::ApplicationVersion(std::string application, int major, int m
                                        int patch)
     : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
 
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
-  // Use singletons to compile only once (ARROW-9863)
-  static regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
-  static regex ver_regex{ApplicationVersion::VERSION_FORMAT};
-  smatch app_matches;
-  smatch ver_matches;
-
-  std::string created_by_lower = created_by;
-  std::transform(created_by_lower.begin(), created_by_lower.end(),
-                 created_by_lower.begin(), ::tolower);
-
-  bool app_success = regex_match(created_by_lower, app_matches, app_regex);
-  bool ver_success = false;
-  std::string version_str;
-
-  if (app_success && app_matches.size() >= 4) {
-    // first match is the entire string. sub-matches start from second.
-    application_ = app_matches[1];
-    version_str = app_matches[3];
-    build_ = app_matches[4];
-    ver_success = regex_match(version_str, ver_matches, ver_regex);
-  } else {
-    application_ = "unknown";
-  }
-
-  if (ver_success && ver_matches.size() >= 7) {
-    version.major = atoi(ver_matches[1].str().c_str());
-    version.minor = atoi(ver_matches[2].str().c_str());
-    version.patch = atoi(ver_matches[3].str().c_str());
-    version.unknown = ver_matches[4].str();
-    version.pre_release = ver_matches[5].str();
-    version.build_info = ver_matches[6].str();
-  } else {
-    version.major = 0;
-    version.minor = 0;
-    version.patch = 0;
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format:
+//   "${APPLICATION_NAME}"
+//   "${APPLICATION_NAME} version ${VERSION}"
+//   "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+//   parquet-cpp
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+//   "${MAJOR}.${MINOR}.${PATCH}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//

Review comment:
       Perhaps link to where parquet-mr does versioning, for reference? https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java#L31-L39




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#issuecomment-770136039


   https://issues.apache.org/jira/browse/ARROW-7288


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nealrichardson commented on a change in pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
nealrichardson commented on a change in pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#discussion_r567267561



##########
File path: cpp/cmake_modules/ThirdpartyToolchain.cmake
##########
@@ -854,13 +854,6 @@ else()
   set(THRIFT_REQUIRES_BOOST FALSE)
 endif()
 
-# Parquet requires boost only with gcc 4.8 (because of missing std::regex).
-if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9")
-  set(PARQUET_REQUIRES_BOOST TRUE)
-else()
-  set(PARQUET_REQUIRES_BOOST FALSE)
-endif()
-

Review comment:
       Should also remove the lines with `BOOST_REGEX_LIBRARY` below, right? Nothing else uses boost::regex in our code. 
   
   We can also remove regex.hpp from cpp/build-support/trim-boost.sh and rebuild our trimmed bundle without it. That should probably be a followup JIRA, but could you edit the comment on L38 of that script to say that we actually don't need it anymore?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#issuecomment-770286802


   Revision: 076863b5677d2cc1e1e700bd009ea847ea0b9a85
   
   Submitted crossbow builds: [ursacomputing/crossbow @ actions-59](https://github.com/ursacomputing/crossbow/branches/all?query=actions-59)
   
   |Task|Status|
   |----|------|
   |centos-7-amd64|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-centos-7-amd64)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-centos-7-amd64)|
   |centos-8-amd64|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-centos-8-amd64)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-centos-8-amd64)|
   |conda-clean|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-clean)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-clean)|
   |conda-linux-gcc-py36-aarch64|[![Drone](https://img.shields.io/drone/build/ursacomputing/crossbow/actions-59-drone-conda-linux-gcc-py36-aarch64.svg)](https://cloud.drone.io/ursacomputing/crossbow)|
   |conda-linux-gcc-py36-cpu-r36|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-linux-gcc-py36-cpu-r36)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-linux-gcc-py36-cpu-r36)|
   |conda-linux-gcc-py36-cuda|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-linux-gcc-py36-cuda)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-linux-gcc-py36-cuda)|
   |conda-linux-gcc-py37-aarch64|[![Drone](https://img.shields.io/drone/build/ursacomputing/crossbow/actions-59-drone-conda-linux-gcc-py37-aarch64.svg)](https://cloud.drone.io/ursacomputing/crossbow)|
   |conda-linux-gcc-py37-cpu-r40|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-linux-gcc-py37-cpu-r40)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-linux-gcc-py37-cpu-r40)|
   |conda-linux-gcc-py37-cuda|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-linux-gcc-py37-cuda)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-linux-gcc-py37-cuda)|
   |conda-linux-gcc-py38-aarch64|[![Drone](https://img.shields.io/drone/build/ursacomputing/crossbow/actions-59-drone-conda-linux-gcc-py38-aarch64.svg)](https://cloud.drone.io/ursacomputing/crossbow)|
   |conda-linux-gcc-py38-cpu|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-linux-gcc-py38-cpu)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-linux-gcc-py38-cpu)|
   |conda-linux-gcc-py38-cuda|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-linux-gcc-py38-cuda)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-linux-gcc-py38-cuda)|
   |conda-linux-gcc-py39-aarch64|[![Drone](https://img.shields.io/drone/build/ursacomputing/crossbow/actions-59-drone-conda-linux-gcc-py39-aarch64.svg)](https://cloud.drone.io/ursacomputing/crossbow)|
   |conda-linux-gcc-py39-cpu|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-linux-gcc-py39-cpu)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-linux-gcc-py39-cpu)|
   |conda-linux-gcc-py39-cuda|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-linux-gcc-py39-cuda)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-linux-gcc-py39-cuda)|
   |conda-osx-clang-py36-r36|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-osx-clang-py36-r36)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-osx-clang-py36-r36)|
   |conda-osx-clang-py37-r40|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-osx-clang-py37-r40)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-osx-clang-py37-r40)|
   |conda-osx-clang-py38|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-osx-clang-py38)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-osx-clang-py38)|
   |conda-osx-clang-py39|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-osx-clang-py39)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-osx-clang-py39)|
   |conda-win-vs2017-py36-r36|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-win-vs2017-py36-r36)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-win-vs2017-py36-r36)|
   |conda-win-vs2017-py37-r40|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-win-vs2017-py37-r40)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-win-vs2017-py37-r40)|
   |conda-win-vs2017-py38|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-conda-win-vs2017-py38)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-conda-win-vs2017-py38)|
   |debian-buster-amd64|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-debian-buster-amd64)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-debian-buster-amd64)|
   |example-cpp-minimal-build-static|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-example-cpp-minimal-build-static)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-example-cpp-minimal-build-static)|
   |example-cpp-minimal-build-static-system-dependency|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-example-cpp-minimal-build-static-system-dependency)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-example-cpp-minimal-build-static-system-dependency)|
   |gandiva-jar-osx|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-gandiva-jar-osx)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-gandiva-jar-osx)|
   |gandiva-jar-ubuntu|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-gandiva-jar-ubuntu)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-gandiva-jar-ubuntu)|
   |homebrew-cpp|[![TravisCI](https://img.shields.io/travis/ursacomputing/crossbow/actions-59-travis-homebrew-cpp.svg)](https://travis-ci.org/ursacomputing/crossbow/branches)|
   |homebrew-r-autobrew|[![TravisCI](https://img.shields.io/travis/ursacomputing/crossbow/actions-59-travis-homebrew-r-autobrew.svg)](https://travis-ci.org/ursacomputing/crossbow/branches)|
   |nuget|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-nuget)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-nuget)|
   |python-sdist|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-python-sdist)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-python-sdist)|
   |test-conda-cpp|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-cpp)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-cpp)|
   |test-conda-cpp-valgrind|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-cpp-valgrind)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-cpp-valgrind)|
   |test-conda-python-3.6|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.6)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.6)|
   |test-conda-python-3.6-pandas-0.23|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.6-pandas-0.23)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.6-pandas-0.23)|
   |test-conda-python-3.7|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.7)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.7)|
   |test-conda-python-3.7-dask-latest|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.7-dask-latest)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.7-dask-latest)|
   |test-conda-python-3.7-hdfs-3.2|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.7-hdfs-3.2)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.7-hdfs-3.2)|
   |test-conda-python-3.7-kartothek-latest|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.7-kartothek-latest)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.7-kartothek-latest)|
   |test-conda-python-3.7-kartothek-master|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.7-kartothek-master)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.7-kartothek-master)|
   |test-conda-python-3.7-pandas-latest|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.7-pandas-latest)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.7-pandas-latest)|
   |test-conda-python-3.7-pandas-master|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.7-pandas-master)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.7-pandas-master)|
   |test-conda-python-3.7-spark-branch-3.0|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.7-spark-branch-3.0)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.7-spark-branch-3.0)|
   |test-conda-python-3.7-turbodbc-latest|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.7-turbodbc-latest)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.7-turbodbc-latest)|
   |test-conda-python-3.7-turbodbc-master|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.7-turbodbc-master)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.7-turbodbc-master)|
   |test-conda-python-3.8|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.8)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.8)|
   |test-conda-python-3.8-dask-master|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.8-dask-master)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.8-dask-master)|
   |test-conda-python-3.8-hypothesis|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.8-hypothesis)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.8-hypothesis)|
   |test-conda-python-3.8-jpype|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.8-jpype)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.8-jpype)|
   |test-conda-python-3.8-pandas-latest|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.8-pandas-latest)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.8-pandas-latest)|
   |test-conda-python-3.8-pandas-nightly|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.8-pandas-nightly)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.8-pandas-nightly)|
   |test-conda-python-3.8-spark-master|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-conda-python-3.8-spark-master)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-conda-python-3.8-spark-master)|
   |test-debian-10-cpp|[![CircleCI](https://img.shields.io/circleci/build/github/ursacomputing/crossbow/actions-59-circle-test-debian-10-cpp.svg)](https://circleci.com/gh/ursacomputing/crossbow/tree/actions-59-circle-test-debian-10-cpp)|
   |test-debian-10-go-1.12|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-debian-10-go-1.12)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-debian-10-go-1.12)|
   |test-debian-10-python-3|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-debian-10-python-3)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-debian-10-python-3)|
   |test-debian-c-glib|[![CircleCI](https://img.shields.io/circleci/build/github/ursacomputing/crossbow/actions-59-circle-test-debian-c-glib.svg)](https://circleci.com/gh/ursacomputing/crossbow/tree/actions-59-circle-test-debian-c-glib)|
   |test-debian-ruby|[![CircleCI](https://img.shields.io/circleci/build/github/ursacomputing/crossbow/actions-59-circle-test-debian-ruby.svg)](https://circleci.com/gh/ursacomputing/crossbow/tree/actions-59-circle-test-debian-ruby)|
   |test-fedora-33-cpp|[![CircleCI](https://img.shields.io/circleci/build/github/ursacomputing/crossbow/actions-59-circle-test-fedora-33-cpp.svg)](https://circleci.com/gh/ursacomputing/crossbow/tree/actions-59-circle-test-fedora-33-cpp)|
   |test-fedora-33-python-3|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-fedora-33-python-3)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-fedora-33-python-3)|
   |test-r-linux-as-cran|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-r-linux-as-cran)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-r-linux-as-cran)|
   |test-r-rhub-ubuntu-gcc-release|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-r-rhub-ubuntu-gcc-release)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-r-rhub-ubuntu-gcc-release)|
   |test-r-rocker-r-base-latest|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-r-rocker-r-base-latest)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-r-rocker-r-base-latest)|
   |test-r-rstudio-r-base-3.6-bionic|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-r-rstudio-r-base-3.6-bionic)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-r-rstudio-r-base-3.6-bionic)|
   |test-r-rstudio-r-base-3.6-centos7-devtoolset-8|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-r-rstudio-r-base-3.6-centos7-devtoolset-8)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-r-rstudio-r-base-3.6-centos7-devtoolset-8)|
   |test-r-rstudio-r-base-3.6-centos8|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-r-rstudio-r-base-3.6-centos8)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-r-rstudio-r-base-3.6-centos8)|
   |test-r-rstudio-r-base-3.6-opensuse15|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-r-rstudio-r-base-3.6-opensuse15)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-r-rstudio-r-base-3.6-opensuse15)|
   |test-r-rstudio-r-base-3.6-opensuse42|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-r-rstudio-r-base-3.6-opensuse42)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-r-rstudio-r-base-3.6-opensuse42)|
   |test-r-version-compatibility|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-r-version-compatibility)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-r-version-compatibility)|
   |test-r-versions|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-r-versions)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-r-versions)|
   |test-ubuntu-16.04-cpp|[![CircleCI](https://img.shields.io/circleci/build/github/ursacomputing/crossbow/actions-59-circle-test-ubuntu-16.04-cpp.svg)](https://circleci.com/gh/ursacomputing/crossbow/tree/actions-59-circle-test-ubuntu-16.04-cpp)|
   |test-ubuntu-18.04-cpp|[![CircleCI](https://img.shields.io/circleci/build/github/ursacomputing/crossbow/actions-59-circle-test-ubuntu-18.04-cpp.svg)](https://circleci.com/gh/ursacomputing/crossbow/tree/actions-59-circle-test-ubuntu-18.04-cpp)|
   |test-ubuntu-18.04-cpp-release|[![CircleCI](https://img.shields.io/circleci/build/github/ursacomputing/crossbow/actions-59-circle-test-ubuntu-18.04-cpp-release.svg)](https://circleci.com/gh/ursacomputing/crossbow/tree/actions-59-circle-test-ubuntu-18.04-cpp-release)|
   |test-ubuntu-18.04-cpp-static|[![CircleCI](https://img.shields.io/circleci/build/github/ursacomputing/crossbow/actions-59-circle-test-ubuntu-18.04-cpp-static.svg)](https://circleci.com/gh/ursacomputing/crossbow/tree/actions-59-circle-test-ubuntu-18.04-cpp-static)|
   |test-ubuntu-18.04-docs|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-ubuntu-18.04-docs)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-ubuntu-18.04-docs)|
   |test-ubuntu-18.04-python-3|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-ubuntu-18.04-python-3)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-ubuntu-18.04-python-3)|
   |test-ubuntu-18.04-r-sanitizer|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-ubuntu-18.04-r-sanitizer)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-ubuntu-18.04-r-sanitizer)|
   |test-ubuntu-20.04-cpp|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-ubuntu-20.04-cpp)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-ubuntu-20.04-cpp)|
   |test-ubuntu-20.04-cpp-14|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-ubuntu-20.04-cpp-14)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-ubuntu-20.04-cpp-14)|
   |test-ubuntu-20.04-cpp-17|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-ubuntu-20.04-cpp-17)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-ubuntu-20.04-cpp-17)|
   |test-ubuntu-20.04-cpp-thread-sanitizer|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-test-ubuntu-20.04-cpp-thread-sanitizer)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-test-ubuntu-20.04-cpp-thread-sanitizer)|
   |test-ubuntu-c-glib|[![CircleCI](https://img.shields.io/circleci/build/github/ursacomputing/crossbow/actions-59-circle-test-ubuntu-c-glib.svg)](https://circleci.com/gh/ursacomputing/crossbow/tree/actions-59-circle-test-ubuntu-c-glib)|
   |test-ubuntu-ruby|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-59-azure-test-ubuntu-ruby)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-59-azure-test-ubuntu-ruby)|
   |ubuntu-bionic-amd64|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-ubuntu-bionic-amd64)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-ubuntu-bionic-amd64)|
   |ubuntu-focal-amd64|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-ubuntu-focal-amd64)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-ubuntu-focal-amd64)|
   |ubuntu-groovy-amd64|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-ubuntu-groovy-amd64)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-ubuntu-groovy-amd64)|
   |ubuntu-xenial-amd64|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-ubuntu-xenial-amd64)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-ubuntu-xenial-amd64)|
   |wheel-manylinux2010-cp36m|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-manylinux2010-cp36m)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-manylinux2010-cp36m)|
   |wheel-manylinux2010-cp37m|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-manylinux2010-cp37m)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-manylinux2010-cp37m)|
   |wheel-manylinux2010-cp38|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-manylinux2010-cp38)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-manylinux2010-cp38)|
   |wheel-manylinux2010-cp39|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-manylinux2010-cp39)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-manylinux2010-cp39)|
   |wheel-manylinux2014-cp36m|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-manylinux2014-cp36m)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-manylinux2014-cp36m)|
   |wheel-manylinux2014-cp37m|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-manylinux2014-cp37m)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-manylinux2014-cp37m)|
   |wheel-manylinux2014-cp38|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-manylinux2014-cp38)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-manylinux2014-cp38)|
   |wheel-manylinux2014-cp39|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-manylinux2014-cp39)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-manylinux2014-cp39)|
   |wheel-osx-high-sierra-cp36m|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-osx-high-sierra-cp36m)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-osx-high-sierra-cp36m)|
   |wheel-osx-high-sierra-cp37m|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-osx-high-sierra-cp37m)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-osx-high-sierra-cp37m)|
   |wheel-osx-high-sierra-cp38|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-osx-high-sierra-cp38)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-osx-high-sierra-cp38)|
   |wheel-osx-high-sierra-cp39|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-osx-high-sierra-cp39)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-osx-high-sierra-cp39)|
   |wheel-osx-mavericks-cp36m|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-osx-mavericks-cp36m)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-osx-mavericks-cp36m)|
   |wheel-osx-mavericks-cp37m|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-osx-mavericks-cp37m)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-osx-mavericks-cp37m)|
   |wheel-osx-mavericks-cp38|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-osx-mavericks-cp38)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-osx-mavericks-cp38)|
   |wheel-osx-mavericks-cp39|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-osx-mavericks-cp39)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-osx-mavericks-cp39)|
   |wheel-windows-cp36m|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-windows-cp36m)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-windows-cp36m)|
   |wheel-windows-cp37m|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-windows-cp37m)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-windows-cp37m)|
   |wheel-windows-cp38|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-windows-cp38)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-windows-cp38)|
   |wheel-windows-cp39|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-59-github-wheel-windows-cp39)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-59-github-wheel-windows-cp39)|


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] kou commented on a change in pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
kou commented on a change in pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#discussion_r568335632



##########
File path: cpp/src/parquet/metadata.cc
##########
@@ -949,43 +910,283 @@ ApplicationVersion::ApplicationVersion(std::string application, int major, int m
                                        int patch)
     : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
 
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
-  // Use singletons to compile only once (ARROW-9863)
-  static regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
-  static regex ver_regex{ApplicationVersion::VERSION_FORMAT};
-  smatch app_matches;
-  smatch ver_matches;
-
-  std::string created_by_lower = created_by;
-  std::transform(created_by_lower.begin(), created_by_lower.end(),
-                 created_by_lower.begin(), ::tolower);
-
-  bool app_success = regex_match(created_by_lower, app_matches, app_regex);
-  bool ver_success = false;
-  std::string version_str;
-
-  if (app_success && app_matches.size() >= 4) {
-    // first match is the entire string. sub-matches start from second.
-    application_ = app_matches[1];
-    version_str = app_matches[3];
-    build_ = app_matches[4];
-    ver_success = regex_match(version_str, ver_matches, ver_regex);
-  } else {
-    application_ = "unknown";
-  }
-
-  if (ver_success && ver_matches.size() >= 7) {
-    version.major = atoi(ver_matches[1].str().c_str());
-    version.minor = atoi(ver_matches[2].str().c_str());
-    version.patch = atoi(ver_matches[3].str().c_str());
-    version.unknown = ver_matches[4].str();
-    version.pre_release = ver_matches[5].str();
-    version.build_info = ver_matches[6].str();
-  } else {
-    version.major = 0;
-    version.minor = 0;
-    version.patch = 0;
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format must be compatible parquet-mr's
+// one. See also:
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
+//
+// The application version format:
+//   "${APPLICATION_NAME}"
+//   "${APPLICATION_NAME} version ${VERSION}"
+//   "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+//   parquet-cpp
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+//   "${MAJOR}.${MINOR}.${PATCH}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//
+// Eg:
+//   1.5.0
+//   1.5.0ab
+//   1.5.0ab-cdh5.5.0
+//   1.5.0ab-cdh5.5.0+cd
+//   1.5.0ab+cd
+//   1.5.0-cdh5.5.0
+//   1.5.0-cdh5.5.0+cd
+//   1.5.0+cd
+class ApplicationVersionParser {
+ public:
+  ApplicationVersionParser(const std::string& created_by,
+                           ApplicationVersion& application_version)
+      : created_by_(created_by),
+        application_version_(application_version),
+        digit_("0123456789") {}
+
+  void Parse() {
+    application_version_.application_ = "unknown";
+    application_version_.version = {0, 0, 0, "", "", ""};
+
+    if (!ParseApplicationName()) {
+      return;
+    }
+    if (!ParseVersion()) {
+      return;
+    }
+    if (!ParseBuildName()) {
+      return;
+    }
+  }
+
+ private:
+  void RemovePrecedingSpaces(const std::string& string, size_t& start,
+                             const size_t& end) {
+    while (start < end && string[start] == ' ') {

Review comment:
       I've added `\t`, `\v`, `\r`, `\n` and `\f` to supported whitespace characters but they will not be appeared as you said.

##########
File path: cpp/src/parquet/metadata.cc
##########
@@ -949,43 +910,283 @@ ApplicationVersion::ApplicationVersion(std::string application, int major, int m
                                        int patch)
     : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
 
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
-  // Use singletons to compile only once (ARROW-9863)
-  static regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
-  static regex ver_regex{ApplicationVersion::VERSION_FORMAT};
-  smatch app_matches;
-  smatch ver_matches;
-
-  std::string created_by_lower = created_by;
-  std::transform(created_by_lower.begin(), created_by_lower.end(),
-                 created_by_lower.begin(), ::tolower);
-
-  bool app_success = regex_match(created_by_lower, app_matches, app_regex);
-  bool ver_success = false;
-  std::string version_str;
-
-  if (app_success && app_matches.size() >= 4) {
-    // first match is the entire string. sub-matches start from second.
-    application_ = app_matches[1];
-    version_str = app_matches[3];
-    build_ = app_matches[4];
-    ver_success = regex_match(version_str, ver_matches, ver_regex);
-  } else {
-    application_ = "unknown";
-  }
-
-  if (ver_success && ver_matches.size() >= 7) {
-    version.major = atoi(ver_matches[1].str().c_str());
-    version.minor = atoi(ver_matches[2].str().c_str());
-    version.patch = atoi(ver_matches[3].str().c_str());
-    version.unknown = ver_matches[4].str();
-    version.pre_release = ver_matches[5].str();
-    version.build_info = ver_matches[6].str();
-  } else {
-    version.major = 0;
-    version.minor = 0;
-    version.patch = 0;
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format must be compatible parquet-mr's
+// one. See also:
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
+//
+// The application version format:
+//   "${APPLICATION_NAME}"
+//   "${APPLICATION_NAME} version ${VERSION}"
+//   "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+//   parquet-cpp
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+//   "${MAJOR}.${MINOR}.${PATCH}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//
+// Eg:
+//   1.5.0
+//   1.5.0ab
+//   1.5.0ab-cdh5.5.0
+//   1.5.0ab-cdh5.5.0+cd
+//   1.5.0ab+cd
+//   1.5.0-cdh5.5.0
+//   1.5.0-cdh5.5.0+cd
+//   1.5.0+cd
+class ApplicationVersionParser {
+ public:
+  ApplicationVersionParser(const std::string& created_by,
+                           ApplicationVersion& application_version)
+      : created_by_(created_by),
+        application_version_(application_version),
+        digit_("0123456789") {}
+
+  void Parse() {
+    application_version_.application_ = "unknown";
+    application_version_.version = {0, 0, 0, "", "", ""};
+
+    if (!ParseApplicationName()) {
+      return;
+    }
+    if (!ParseVersion()) {
+      return;
+    }
+    if (!ParseBuildName()) {
+      return;
+    }
+  }
+
+ private:
+  void RemovePrecedingSpaces(const std::string& string, size_t& start,
+                             const size_t& end) {
+    while (start < end && string[start] == ' ') {
+      ++start;
+    }
+  }
+
+  void RemoveTrailingSpaces(const std::string& string, const size_t& start, size_t& end) {
+    while (start < (end - 1) && (end - 1) < string.size() && string[end - 1] == ' ') {
+      --end;
+    }
   }
+
+  bool ParseApplicationName() {
+    std::string version_mark(" version ");
+    auto version_mark_position = created_by_.find(version_mark);
+    size_t application_name_end;
+    // No VERSION and BUILD_NAME.
+    if (version_mark_position == std::string::npos) {
+      version_start_ = std::string::npos;
+      application_name_end = created_by_.size();
+    } else {
+      version_start_ = version_mark_position + version_mark.size();
+      application_name_end = version_mark_position;
+    }
+
+    size_t application_name_start = 0;
+    RemovePrecedingSpaces(created_by_, application_name_start, application_name_end);
+    RemoveTrailingSpaces(created_by_, application_name_start, application_name_end);
+    application_version_.application_ = created_by_.substr(
+        application_name_start, application_name_end - application_name_start);
+
+    return true;
+  }
+
+  bool ParseVersion() {
+    // No VERSION.
+    if (version_start_ == std::string::npos) {
+      return true;
+    }
+
+    RemovePrecedingSpaces(created_by_, version_start_, created_by_.size());
+    version_end_ = created_by_.find(" (", version_start_);
+    // No BUILD_NAME.
+    if (version_end_ == std::string::npos) {
+      version_end_ = created_by_.size();
+    }
+    RemoveTrailingSpaces(created_by_, version_start_, version_end_);
+    // No VERSION.
+    if (version_start_ == version_end_) {
+      return false;
+    }
+    version_string_ = created_by_.substr(version_start_, version_end_ - version_start_);
+
+    if (!ParseVersionMajor()) {
+      return false;
+    }
+    if (!ParseVersionMinor()) {
+      return false;
+    }
+    if (!ParseVersionPatch()) {
+      return false;
+    }
+    if (!ParseVersionUnknown()) {
+      return false;
+    }
+    if (!ParseVersionPreRelease()) {
+      return false;
+    }
+    if (!ParseVersionBuildInfo()) {
+      return false;
+    }
+
+    return true;
+  }
+
+  bool ParseVersionMajor() {
+    size_t version_major_start = 0;
+    auto version_major_end = version_string_.find_first_not_of(digit_);
+    // No ".".
+    if (version_major_end == std::string::npos ||

Review comment:
       We can support `${MAJOR}` only case but it's not accepted in the original implementation. So I rejected the case.
   
   But I've added support for `${MAJOR}` because we can be liberal for input.

##########
File path: cpp/src/parquet/metadata.h
##########
@@ -58,16 +58,6 @@ class PARQUET_EXPORT ApplicationVersion {
   static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
   static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
   static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
-  // Regular expression for the version format
-  // major . minor . patch unknown - prerelease.x + build info
-  // Eg: 1.5.0ab-cdh5.5.0+cd
-  static constexpr char const* VERSION_FORMAT =
-      "^(\\d+)\\.(\\d+)\\.(\\d+)([^-+]*)?(?:-([^+]*))?(?:\\+(.*))?$";
-  // Regular expression for the application format
-  // application_name version VERSION_FORMAT (build build_name)
-  // Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
-  static constexpr char const* APPLICATION_FORMAT =
-      "(.*?)\\s*(?:(version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?)?)";
 

Review comment:
       Good catch!
   I've removed.

##########
File path: cpp/src/parquet/metadata_test.cc
##########
@@ -342,5 +342,123 @@ TEST(ApplicationVersion, Basics) {
       version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_int, SortOrder::UNSIGNED));
 }
 
+TEST(ApplicationVersion, Empty) {
+  ApplicationVersion version("");
+
+  ASSERT_EQ("", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionEmpty) {
+  ApplicationVersion version("parquet-mr version ");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoMajor) {
+  ApplicationVersion version("parquet-mr version .");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoMinor) {
+  ApplicationVersion version("parquet-mr version 1.");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoPatch) {
+  ApplicationVersion version("parquet-mr version 1.7.");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoUnknown) {
+  ApplicationVersion version("parquet-mr version 1.7.9-cdh5.5.0+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("cdh5.5.0", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoPreRelease) {
+  ApplicationVersion version("parquet-mr version 1.7.9ab+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("ab", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoUnknownNoPreRelease) {
+  ApplicationVersion version("parquet-mr version 1.7.9+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, FullWithSpaces) {
+  ApplicationVersion version(
+      " parquet-mr  version  1.5.3ab-cdh5.5.0+cd  (build  abcd ) ");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("abcd", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(5, version.version.minor);
+  ASSERT_EQ(3, version.version.patch);
+  ASSERT_EQ("ab", version.version.unknown);
+  ASSERT_EQ("cdh5.5.0", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+

Review comment:
       I've add some more tests.
   
   Note that this implementation assumes that input encoding is ASCII. It may not work with other encodings.
   Should we support non ASCII encodings?

##########
File path: cpp/src/parquet/metadata.cc
##########
@@ -949,43 +910,283 @@ ApplicationVersion::ApplicationVersion(std::string application, int major, int m
                                        int patch)
     : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
 
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
-  // Use singletons to compile only once (ARROW-9863)
-  static regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
-  static regex ver_regex{ApplicationVersion::VERSION_FORMAT};
-  smatch app_matches;
-  smatch ver_matches;
-
-  std::string created_by_lower = created_by;
-  std::transform(created_by_lower.begin(), created_by_lower.end(),
-                 created_by_lower.begin(), ::tolower);
-
-  bool app_success = regex_match(created_by_lower, app_matches, app_regex);
-  bool ver_success = false;
-  std::string version_str;
-
-  if (app_success && app_matches.size() >= 4) {
-    // first match is the entire string. sub-matches start from second.
-    application_ = app_matches[1];
-    version_str = app_matches[3];
-    build_ = app_matches[4];
-    ver_success = regex_match(version_str, ver_matches, ver_regex);
-  } else {
-    application_ = "unknown";
-  }
-
-  if (ver_success && ver_matches.size() >= 7) {
-    version.major = atoi(ver_matches[1].str().c_str());
-    version.minor = atoi(ver_matches[2].str().c_str());
-    version.patch = atoi(ver_matches[3].str().c_str());
-    version.unknown = ver_matches[4].str();
-    version.pre_release = ver_matches[5].str();
-    version.build_info = ver_matches[6].str();
-  } else {
-    version.major = 0;
-    version.minor = 0;
-    version.patch = 0;
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format must be compatible parquet-mr's
+// one. See also:
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
+//
+// The application version format:
+//   "${APPLICATION_NAME}"
+//   "${APPLICATION_NAME} version ${VERSION}"
+//   "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+//   parquet-cpp
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+//   "${MAJOR}.${MINOR}.${PATCH}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//
+// Eg:
+//   1.5.0
+//   1.5.0ab
+//   1.5.0ab-cdh5.5.0
+//   1.5.0ab-cdh5.5.0+cd
+//   1.5.0ab+cd
+//   1.5.0-cdh5.5.0
+//   1.5.0-cdh5.5.0+cd
+//   1.5.0+cd
+class ApplicationVersionParser {
+ public:
+  ApplicationVersionParser(const std::string& created_by,
+                           ApplicationVersion& application_version)
+      : created_by_(created_by),
+        application_version_(application_version),
+        digit_("0123456789") {}
+
+  void Parse() {
+    application_version_.application_ = "unknown";
+    application_version_.version = {0, 0, 0, "", "", ""};
+
+    if (!ParseApplicationName()) {
+      return;
+    }
+    if (!ParseVersion()) {
+      return;
+    }
+    if (!ParseBuildName()) {
+      return;
+    }
+  }
+
+ private:
+  void RemovePrecedingSpaces(const std::string& string, size_t& start,
+                             const size_t& end) {
+    while (start < end && string[start] == ' ') {
+      ++start;
+    }
+  }
+
+  void RemoveTrailingSpaces(const std::string& string, const size_t& start, size_t& end) {
+    while (start < (end - 1) && (end - 1) < string.size() && string[end - 1] == ' ') {
+      --end;
+    }
   }
+
+  bool ParseApplicationName() {
+    std::string version_mark(" version ");
+    auto version_mark_position = created_by_.find(version_mark);
+    size_t application_name_end;
+    // No VERSION and BUILD_NAME.
+    if (version_mark_position == std::string::npos) {
+      version_start_ = std::string::npos;
+      application_name_end = created_by_.size();
+    } else {
+      version_start_ = version_mark_position + version_mark.size();
+      application_name_end = version_mark_position;
+    }
+
+    size_t application_name_start = 0;
+    RemovePrecedingSpaces(created_by_, application_name_start, application_name_end);
+    RemoveTrailingSpaces(created_by_, application_name_start, application_name_end);
+    application_version_.application_ = created_by_.substr(
+        application_name_start, application_name_end - application_name_start);
+
+    return true;
+  }
+
+  bool ParseVersion() {
+    // No VERSION.
+    if (version_start_ == std::string::npos) {
+      return true;
+    }
+
+    RemovePrecedingSpaces(created_by_, version_start_, created_by_.size());
+    version_end_ = created_by_.find(" (", version_start_);
+    // No BUILD_NAME.
+    if (version_end_ == std::string::npos) {
+      version_end_ = created_by_.size();
+    }
+    RemoveTrailingSpaces(created_by_, version_start_, version_end_);
+    // No VERSION.
+    if (version_start_ == version_end_) {
+      return false;
+    }
+    version_string_ = created_by_.substr(version_start_, version_end_ - version_start_);
+
+    if (!ParseVersionMajor()) {
+      return false;
+    }
+    if (!ParseVersionMinor()) {
+      return false;
+    }
+    if (!ParseVersionPatch()) {
+      return false;
+    }
+    if (!ParseVersionUnknown()) {
+      return false;
+    }
+    if (!ParseVersionPreRelease()) {
+      return false;
+    }
+    if (!ParseVersionBuildInfo()) {
+      return false;
+    }
+
+    return true;
+  }
+
+  bool ParseVersionMajor() {
+    size_t version_major_start = 0;
+    auto version_major_end = version_string_.find_first_not_of(digit_);
+    // No ".".
+    if (version_major_end == std::string::npos ||
+        version_string_[version_major_end] != '.') {
+      return false;
+    }
+    // No MAJOR.
+    if (version_major_end == version_major_start) {
+      return false;
+    }
+    auto version_major_string = version_string_.substr(
+        version_major_start, version_major_end - version_major_start);
+    application_version_.version.major = atoi(version_major_string.c_str());
+    version_parsing_position_ = version_major_end + 1;  // +1 is for '.'.
+    return true;
+  }
+
+  bool ParseVersionMinor() {
+    auto version_minor_start = version_parsing_position_;
+    auto version_minor_end =
+        version_string_.find_first_not_of(digit_, version_minor_start);
+    if (version_minor_end == std::string::npos ||

Review comment:
       Ditto.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #9367: ARROW-7288: [C++][Parquet] Don't use regular expression to parse application version

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #9367:
URL: https://github.com/apache/arrow/pull/9367#discussion_r568015264



##########
File path: cpp/src/parquet/metadata.h
##########
@@ -58,16 +58,6 @@ class PARQUET_EXPORT ApplicationVersion {
   static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
   static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
   static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
-  // Regular expression for the version format
-  // major . minor . patch unknown - prerelease.x + build info
-  // Eg: 1.5.0ab-cdh5.5.0+cd
-  static constexpr char const* VERSION_FORMAT =
-      "^(\\d+)\\.(\\d+)\\.(\\d+)([^-+]*)?(?:-([^+]*))?(?:\\+(.*))?$";
-  // Regular expression for the application format
-  // application_name version VERSION_FORMAT (build build_name)
-  // Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
-  static constexpr char const* APPLICATION_FORMAT =
-      "(.*?)\\s*(?:(version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?)?)";
 

Review comment:
       Does the comment "TODO (majetideepak): Implement support for pre_release" below need to be removed?

##########
File path: cpp/src/parquet/metadata.cc
##########
@@ -949,43 +910,283 @@ ApplicationVersion::ApplicationVersion(std::string application, int major, int m
                                        int patch)
     : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
 
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
-  // Use singletons to compile only once (ARROW-9863)
-  static regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
-  static regex ver_regex{ApplicationVersion::VERSION_FORMAT};
-  smatch app_matches;
-  smatch ver_matches;
-
-  std::string created_by_lower = created_by;
-  std::transform(created_by_lower.begin(), created_by_lower.end(),
-                 created_by_lower.begin(), ::tolower);
-
-  bool app_success = regex_match(created_by_lower, app_matches, app_regex);
-  bool ver_success = false;
-  std::string version_str;
-
-  if (app_success && app_matches.size() >= 4) {
-    // first match is the entire string. sub-matches start from second.
-    application_ = app_matches[1];
-    version_str = app_matches[3];
-    build_ = app_matches[4];
-    ver_success = regex_match(version_str, ver_matches, ver_regex);
-  } else {
-    application_ = "unknown";
-  }
-
-  if (ver_success && ver_matches.size() >= 7) {
-    version.major = atoi(ver_matches[1].str().c_str());
-    version.minor = atoi(ver_matches[2].str().c_str());
-    version.patch = atoi(ver_matches[3].str().c_str());
-    version.unknown = ver_matches[4].str();
-    version.pre_release = ver_matches[5].str();
-    version.build_info = ver_matches[6].str();
-  } else {
-    version.major = 0;
-    version.minor = 0;
-    version.patch = 0;
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format must be compatible parquet-mr's
+// one. See also:
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
+//
+// The application version format:
+//   "${APPLICATION_NAME}"
+//   "${APPLICATION_NAME} version ${VERSION}"
+//   "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+//   parquet-cpp
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+//   "${MAJOR}.${MINOR}.${PATCH}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//
+// Eg:
+//   1.5.0
+//   1.5.0ab
+//   1.5.0ab-cdh5.5.0
+//   1.5.0ab-cdh5.5.0+cd
+//   1.5.0ab+cd
+//   1.5.0-cdh5.5.0
+//   1.5.0-cdh5.5.0+cd
+//   1.5.0+cd
+class ApplicationVersionParser {
+ public:
+  ApplicationVersionParser(const std::string& created_by,
+                           ApplicationVersion& application_version)
+      : created_by_(created_by),
+        application_version_(application_version),
+        digit_("0123456789") {}
+
+  void Parse() {
+    application_version_.application_ = "unknown";
+    application_version_.version = {0, 0, 0, "", "", ""};
+
+    if (!ParseApplicationName()) {
+      return;
+    }
+    if (!ParseVersion()) {
+      return;
+    }
+    if (!ParseBuildName()) {
+      return;
+    }
+  }
+
+ private:
+  void RemovePrecedingSpaces(const std::string& string, size_t& start,
+                             const size_t& end) {
+    while (start < end && string[start] == ' ') {

Review comment:
       Note that `\s` in a regex may match other whitespace characters. But they're unlikely to appear in the `created_by` field anyway.

##########
File path: cpp/src/parquet/metadata.cc
##########
@@ -949,43 +910,283 @@ ApplicationVersion::ApplicationVersion(std::string application, int major, int m
                                        int patch)
     : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
 
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
-  // Use singletons to compile only once (ARROW-9863)
-  static regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
-  static regex ver_regex{ApplicationVersion::VERSION_FORMAT};
-  smatch app_matches;
-  smatch ver_matches;
-
-  std::string created_by_lower = created_by;
-  std::transform(created_by_lower.begin(), created_by_lower.end(),
-                 created_by_lower.begin(), ::tolower);
-
-  bool app_success = regex_match(created_by_lower, app_matches, app_regex);
-  bool ver_success = false;
-  std::string version_str;
-
-  if (app_success && app_matches.size() >= 4) {
-    // first match is the entire string. sub-matches start from second.
-    application_ = app_matches[1];
-    version_str = app_matches[3];
-    build_ = app_matches[4];
-    ver_success = regex_match(version_str, ver_matches, ver_regex);
-  } else {
-    application_ = "unknown";
-  }
-
-  if (ver_success && ver_matches.size() >= 7) {
-    version.major = atoi(ver_matches[1].str().c_str());
-    version.minor = atoi(ver_matches[2].str().c_str());
-    version.patch = atoi(ver_matches[3].str().c_str());
-    version.unknown = ver_matches[4].str();
-    version.pre_release = ver_matches[5].str();
-    version.build_info = ver_matches[6].str();
-  } else {
-    version.major = 0;
-    version.minor = 0;
-    version.patch = 0;
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format must be compatible parquet-mr's
+// one. See also:
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
+//
+// The application version format:
+//   "${APPLICATION_NAME}"
+//   "${APPLICATION_NAME} version ${VERSION}"
+//   "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+//   parquet-cpp
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+//   "${MAJOR}.${MINOR}.${PATCH}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//
+// Eg:
+//   1.5.0
+//   1.5.0ab
+//   1.5.0ab-cdh5.5.0
+//   1.5.0ab-cdh5.5.0+cd
+//   1.5.0ab+cd
+//   1.5.0-cdh5.5.0
+//   1.5.0-cdh5.5.0+cd
+//   1.5.0+cd
+class ApplicationVersionParser {
+ public:
+  ApplicationVersionParser(const std::string& created_by,
+                           ApplicationVersion& application_version)
+      : created_by_(created_by),
+        application_version_(application_version),
+        digit_("0123456789") {}
+
+  void Parse() {
+    application_version_.application_ = "unknown";
+    application_version_.version = {0, 0, 0, "", "", ""};
+
+    if (!ParseApplicationName()) {
+      return;
+    }
+    if (!ParseVersion()) {
+      return;
+    }
+    if (!ParseBuildName()) {
+      return;
+    }
+  }
+
+ private:
+  void RemovePrecedingSpaces(const std::string& string, size_t& start,
+                             const size_t& end) {
+    while (start < end && string[start] == ' ') {
+      ++start;
+    }
+  }
+
+  void RemoveTrailingSpaces(const std::string& string, const size_t& start, size_t& end) {
+    while (start < (end - 1) && (end - 1) < string.size() && string[end - 1] == ' ') {
+      --end;
+    }
   }
+
+  bool ParseApplicationName() {
+    std::string version_mark(" version ");
+    auto version_mark_position = created_by_.find(version_mark);
+    size_t application_name_end;
+    // No VERSION and BUILD_NAME.
+    if (version_mark_position == std::string::npos) {
+      version_start_ = std::string::npos;
+      application_name_end = created_by_.size();
+    } else {
+      version_start_ = version_mark_position + version_mark.size();
+      application_name_end = version_mark_position;
+    }
+
+    size_t application_name_start = 0;
+    RemovePrecedingSpaces(created_by_, application_name_start, application_name_end);
+    RemoveTrailingSpaces(created_by_, application_name_start, application_name_end);
+    application_version_.application_ = created_by_.substr(
+        application_name_start, application_name_end - application_name_start);
+
+    return true;
+  }
+
+  bool ParseVersion() {
+    // No VERSION.
+    if (version_start_ == std::string::npos) {
+      return true;
+    }
+
+    RemovePrecedingSpaces(created_by_, version_start_, created_by_.size());
+    version_end_ = created_by_.find(" (", version_start_);
+    // No BUILD_NAME.
+    if (version_end_ == std::string::npos) {
+      version_end_ = created_by_.size();
+    }
+    RemoveTrailingSpaces(created_by_, version_start_, version_end_);
+    // No VERSION.
+    if (version_start_ == version_end_) {
+      return false;
+    }
+    version_string_ = created_by_.substr(version_start_, version_end_ - version_start_);
+
+    if (!ParseVersionMajor()) {
+      return false;
+    }
+    if (!ParseVersionMinor()) {
+      return false;
+    }
+    if (!ParseVersionPatch()) {
+      return false;
+    }
+    if (!ParseVersionUnknown()) {
+      return false;
+    }
+    if (!ParseVersionPreRelease()) {
+      return false;
+    }
+    if (!ParseVersionBuildInfo()) {
+      return false;
+    }
+
+    return true;
+  }
+
+  bool ParseVersionMajor() {
+    size_t version_major_start = 0;
+    auto version_major_end = version_string_.find_first_not_of(digit_);
+    // No ".".
+    if (version_major_end == std::string::npos ||
+        version_string_[version_major_end] != '.') {
+      return false;
+    }
+    // No MAJOR.
+    if (version_major_end == version_major_start) {
+      return false;
+    }
+    auto version_major_string = version_string_.substr(
+        version_major_start, version_major_end - version_major_start);
+    application_version_.version.major = atoi(version_major_string.c_str());
+    version_parsing_position_ = version_major_end + 1;  // +1 is for '.'.
+    return true;
+  }
+
+  bool ParseVersionMinor() {
+    auto version_minor_start = version_parsing_position_;
+    auto version_minor_end =
+        version_string_.find_first_not_of(digit_, version_minor_start);
+    if (version_minor_end == std::string::npos ||

Review comment:
       Same here if `npos`.

##########
File path: cpp/src/parquet/metadata.cc
##########
@@ -949,43 +910,283 @@ ApplicationVersion::ApplicationVersion(std::string application, int major, int m
                                        int patch)
     : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
 
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
-  // Use singletons to compile only once (ARROW-9863)
-  static regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
-  static regex ver_regex{ApplicationVersion::VERSION_FORMAT};
-  smatch app_matches;
-  smatch ver_matches;
-
-  std::string created_by_lower = created_by;
-  std::transform(created_by_lower.begin(), created_by_lower.end(),
-                 created_by_lower.begin(), ::tolower);
-
-  bool app_success = regex_match(created_by_lower, app_matches, app_regex);
-  bool ver_success = false;
-  std::string version_str;
-
-  if (app_success && app_matches.size() >= 4) {
-    // first match is the entire string. sub-matches start from second.
-    application_ = app_matches[1];
-    version_str = app_matches[3];
-    build_ = app_matches[4];
-    ver_success = regex_match(version_str, ver_matches, ver_regex);
-  } else {
-    application_ = "unknown";
-  }
-
-  if (ver_success && ver_matches.size() >= 7) {
-    version.major = atoi(ver_matches[1].str().c_str());
-    version.minor = atoi(ver_matches[2].str().c_str());
-    version.patch = atoi(ver_matches[3].str().c_str());
-    version.unknown = ver_matches[4].str();
-    version.pre_release = ver_matches[5].str();
-    version.build_info = ver_matches[6].str();
-  } else {
-    version.major = 0;
-    version.minor = 0;
-    version.patch = 0;
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format must be compatible parquet-mr's
+// one. See also:
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
+//   * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
+//
+// The application version format:
+//   "${APPLICATION_NAME}"
+//   "${APPLICATION_NAME} version ${VERSION}"
+//   "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+//   parquet-cpp
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+//   parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+//   "${MAJOR}.${MINOR}.${PATCH}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+//   "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+//   "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//
+// Eg:
+//   1.5.0
+//   1.5.0ab
+//   1.5.0ab-cdh5.5.0
+//   1.5.0ab-cdh5.5.0+cd
+//   1.5.0ab+cd
+//   1.5.0-cdh5.5.0
+//   1.5.0-cdh5.5.0+cd
+//   1.5.0+cd
+class ApplicationVersionParser {
+ public:
+  ApplicationVersionParser(const std::string& created_by,
+                           ApplicationVersion& application_version)
+      : created_by_(created_by),
+        application_version_(application_version),
+        digit_("0123456789") {}
+
+  void Parse() {
+    application_version_.application_ = "unknown";
+    application_version_.version = {0, 0, 0, "", "", ""};
+
+    if (!ParseApplicationName()) {
+      return;
+    }
+    if (!ParseVersion()) {
+      return;
+    }
+    if (!ParseBuildName()) {
+      return;
+    }
+  }
+
+ private:
+  void RemovePrecedingSpaces(const std::string& string, size_t& start,
+                             const size_t& end) {
+    while (start < end && string[start] == ' ') {
+      ++start;
+    }
+  }
+
+  void RemoveTrailingSpaces(const std::string& string, const size_t& start, size_t& end) {
+    while (start < (end - 1) && (end - 1) < string.size() && string[end - 1] == ' ') {
+      --end;
+    }
   }
+
+  bool ParseApplicationName() {
+    std::string version_mark(" version ");
+    auto version_mark_position = created_by_.find(version_mark);
+    size_t application_name_end;
+    // No VERSION and BUILD_NAME.
+    if (version_mark_position == std::string::npos) {
+      version_start_ = std::string::npos;
+      application_name_end = created_by_.size();
+    } else {
+      version_start_ = version_mark_position + version_mark.size();
+      application_name_end = version_mark_position;
+    }
+
+    size_t application_name_start = 0;
+    RemovePrecedingSpaces(created_by_, application_name_start, application_name_end);
+    RemoveTrailingSpaces(created_by_, application_name_start, application_name_end);
+    application_version_.application_ = created_by_.substr(
+        application_name_start, application_name_end - application_name_start);
+
+    return true;
+  }
+
+  bool ParseVersion() {
+    // No VERSION.
+    if (version_start_ == std::string::npos) {
+      return true;
+    }
+
+    RemovePrecedingSpaces(created_by_, version_start_, created_by_.size());
+    version_end_ = created_by_.find(" (", version_start_);
+    // No BUILD_NAME.
+    if (version_end_ == std::string::npos) {
+      version_end_ = created_by_.size();
+    }
+    RemoveTrailingSpaces(created_by_, version_start_, version_end_);
+    // No VERSION.
+    if (version_start_ == version_end_) {
+      return false;
+    }
+    version_string_ = created_by_.substr(version_start_, version_end_ - version_start_);
+
+    if (!ParseVersionMajor()) {
+      return false;
+    }
+    if (!ParseVersionMinor()) {
+      return false;
+    }
+    if (!ParseVersionPatch()) {
+      return false;
+    }
+    if (!ParseVersionUnknown()) {
+      return false;
+    }
+    if (!ParseVersionPreRelease()) {
+      return false;
+    }
+    if (!ParseVersionBuildInfo()) {
+      return false;
+    }
+
+    return true;
+  }
+
+  bool ParseVersionMajor() {
+    size_t version_major_start = 0;
+    auto version_major_end = version_string_.find_first_not_of(digit_);
+    // No ".".
+    if (version_major_end == std::string::npos ||

Review comment:
       If it's `npos`, then we can still parse the major number, no?

##########
File path: cpp/src/parquet/metadata_test.cc
##########
@@ -342,5 +342,123 @@ TEST(ApplicationVersion, Basics) {
       version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_int, SortOrder::UNSIGNED));
 }
 
+TEST(ApplicationVersion, Empty) {
+  ApplicationVersion version("");
+
+  ASSERT_EQ("", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionEmpty) {
+  ApplicationVersion version("parquet-mr version ");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoMajor) {
+  ApplicationVersion version("parquet-mr version .");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(0, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoMinor) {
+  ApplicationVersion version("parquet-mr version 1.");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(0, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoPatch) {
+  ApplicationVersion version("parquet-mr version 1.7.");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(0, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoUnknown) {
+  ApplicationVersion version("parquet-mr version 1.7.9-cdh5.5.0+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("cdh5.5.0", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoPreRelease) {
+  ApplicationVersion version("parquet-mr version 1.7.9ab+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("ab", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, VersionNoUnknownNoPreRelease) {
+  ApplicationVersion version("parquet-mr version 1.7.9+cd");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(7, version.version.minor);
+  ASSERT_EQ(9, version.version.patch);
+  ASSERT_EQ("", version.version.unknown);
+  ASSERT_EQ("", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+
+TEST(ApplicationVersion, FullWithSpaces) {
+  ApplicationVersion version(
+      " parquet-mr  version  1.5.3ab-cdh5.5.0+cd  (build  abcd ) ");
+
+  ASSERT_EQ("parquet-mr", version.application_);
+  ASSERT_EQ("abcd", version.build_);
+  ASSERT_EQ(1, version.version.major);
+  ASSERT_EQ(5, version.version.minor);
+  ASSERT_EQ(3, version.version.patch);
+  ASSERT_EQ("ab", version.version.unknown);
+  ASSERT_EQ("cdh5.5.0", version.version.pre_release);
+  ASSERT_EQ("cd", version.version.build_info);
+}
+

Review comment:
       Can you add a test with a malformed version string?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org