You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ki...@apache.org on 2015/04/20 05:41:29 UTC
[text] SANDBOX-498 Add parser options and initialise regular expressions once
Repository: commons-text
Updated Branches:
refs/heads/SANDBOX-498-OPTIONS [created] 331f80bfc
SANDBOX-498 Add parser options and initialise regular expressions once
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/331f80bf
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/331f80bf
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/331f80bf
Branch: refs/heads/SANDBOX-498-OPTIONS
Commit: 331f80bfcf0380fcc35a6d18a327aef4a9e844e4
Parents: bf8bfb0
Author: Bruno P. Kinoshita <br...@yahoo.com.br>
Authored: Mon Apr 20 15:41:05 2015 +1200
Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
Committed: Mon Apr 20 15:41:09 2015 +1200
----------------------------------------------------------------------
.../commons/text/names/HumanNameParser.java | 73 ++++++++++++--------
.../commons/text/names/ParserOptions.java | 59 ++++++++++++++++
2 files changed, 102 insertions(+), 30 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/331f80bf/src/main/java/org/apache/commons/text/names/HumanNameParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/names/HumanNameParser.java b/src/main/java/org/apache/commons/text/names/HumanNameParser.java
index 5407d15..e7a3927 100644
--- a/src/main/java/org/apache/commons/text/names/HumanNameParser.java
+++ b/src/main/java/org/apache/commons/text/names/HumanNameParser.java
@@ -17,8 +17,6 @@
*/
package org.apache.commons.text.names;
-import java.util.Arrays;
-import java.util.List;
import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
@@ -100,22 +98,51 @@ import org.apache.commons.lang3.StringUtils;
*/
public final class HumanNameParser {
- private final List<String> suffixes;
- private final List<String> prefixes;
+ /**
+ * The options used by the parser.
+ */
+ private final ParserOptions options;
+
+ /*
+ * Regular expressions used by the parser.
+ */
+ // The regex use is a bit tricky. *Everything* matched by the regex will be replaced,
+ // but you can select a particular parenthesized submatch to be returned.
+ // Also, note that each regex requres that the preceding ones have been run, and matches chopped out.
+ // names that starts or end w/ an apostrophe break this
+ private final static String NICKNAMES_REGEX = "(?i) ('|\\\"|\\(\\\"*'*)(.+?)('|\\\"|\\\"*'*\\)) ";
+ // note the lookahead, which isn't returned or replaced
+ private final static String LEADING_INIT_REGEX = "(?i)(^(.\\.*)(?= \\p{L}{2}))";
+ private final static String FIRST_NAME_REGEX = "(?i)^([^ ]+)";
+ private final String suffixRegex;
+ private final String lastRegex;
+
/**
* Creates a new parser.
*/
public HumanNameParser() {
- // TODO make this configurable
- this.suffixes = Arrays.asList(
- "esq", "esquire", "jr",
- "sr", "2", "ii", "iii", "iv");
- this.prefixes = Arrays.asList(
- "bar", "ben", "bin", "da", "dal",
- "de la", "de", "del", "der", "di", "ibn", "la", "le",
- "san", "st", "ste", "van", "van der", "van den", "vel",
- "von" );
+ this(ParserOptions.DEFAULT_OPTIONS);
+ }
+
+ /**
+ * Creates a new parser by providing options.
+ */
+ public HumanNameParser(ParserOptions options) {
+ this.options = options;
+ final String suffixes = StringUtils.join(options.getSuffixes(), "\\.*|") + "\\.*";
+ final String prefixes = StringUtils.join(options.getPrefixes(), " |") + " ";
+ suffixRegex = "(?i),* *((" + suffixes + ")$)";
+ lastRegex = "(?i)(?!^)\\b([^ ]+ y |" + prefixes + ")*[^ ]+$";
+ }
+
+ /**
+ * Gets the parser options.
+ *
+ * @return parser options
+ */
+ public ParserOptions getOptions() {
+ return options;
}
/**
@@ -129,23 +156,9 @@ public final class HumanNameParser {
Objects.requireNonNull(name, "Parameter 'name' must not be null.");
NameString nameString = new NameString(name);
- // TODO compile regexes only once when the parser is created
- String suffixes = StringUtils.join(this.suffixes, "\\.*|") + "\\.*";
- String prefixes = StringUtils.join(this.prefixes, " |") + " ";
-
- // The regex use is a bit tricky. *Everything* matched by the regex will be replaced,
- // but you can select a particular parenthesized submatch to be returned.
- // Also, note that each regex requres that the preceding ones have been run, and matches chopped out.
- // names that starts or end w/ an apostrophe break this
- String nicknamesRegex = "(?i) ('|\\\"|\\(\\\"*'*)(.+?)('|\\\"|\\\"*'*\\)) ";
- String suffixRegex = "(?i),* *((" + suffixes + ")$)";
- String lastRegex = "(?i)(?!^)\\b([^ ]+ y |" + prefixes + ")*[^ ]+$";
- // note the lookahead, which isn't returned or replaced
- String leadingInitRegex = "(?i)(^(.\\.*)(?= \\p{L}{2}))";
- String firstRegex = "(?i)^([^ ]+)";
// get nickname, if there is one
- String nickname = nameString.chopWithRegex(nicknamesRegex, 2);
+ String nickname = nameString.chopWithRegex(NICKNAMES_REGEX, 2);
// get suffix, if there is one
String suffix = nameString.chopWithRegex(suffixRegex, 1);
@@ -157,10 +170,10 @@ public final class HumanNameParser {
String last = nameString.chopWithRegex(lastRegex, 0);
// get the first initial, if there is one
- String leadingInit = nameString.chopWithRegex(leadingInitRegex, 1);
+ String leadingInit = nameString.chopWithRegex(LEADING_INIT_REGEX, 1);
// get the first name
- String first = nameString.chopWithRegex(firstRegex, 0);
+ String first = nameString.chopWithRegex(FIRST_NAME_REGEX, 0);
if (StringUtils.isBlank(first)) {
throw new NameParseException("Couldn't find a first name in '{" + nameString.getWrappedString() + "}'");
}
http://git-wip-us.apache.org/repos/asf/commons-text/blob/331f80bf/src/main/java/org/apache/commons/text/names/ParserOptions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/names/ParserOptions.java b/src/main/java/org/apache/commons/text/names/ParserOptions.java
new file mode 100644
index 0000000..6bca771
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/names/ParserOptions.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.names;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Options for the {@link HumanNameParser} parser.
+ */
+public final class ParserOptions {
+
+ public static final ParserOptions DEFAULT_OPTIONS = new ParserOptions();
+
+ private final Set<String> suffixes;
+
+ private final Set<String> prefixes;
+
+ public ParserOptions() {
+ this.suffixes = new HashSet<String>(Arrays.asList(
+ "esq", "esquire", "jr",
+ "sr", "2", "ii", "iii", "iv"));
+ this.prefixes = new HashSet<String>(Arrays.asList(
+ "bar", "ben", "bin", "da", "dal",
+ "de la", "de", "del", "der", "di", "ibn", "la", "le",
+ "san", "st", "ste", "van", "van der", "van den", "vel",
+ "von"));
+ }
+
+ /**
+ * @return the suffixes
+ */
+ public Set<String> getSuffixes() {
+ return suffixes;
+ }
+
+ /**
+ * @return the prefixes
+ */
+ public Set<String> getPrefixes() {
+ return prefixes;
+ }
+
+}