You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ki...@apache.org on 2015/04/20 05:41:29 UTC
[text] SANDBOX-498 Add parser options and initialise regular expressions once

Repository: commons-text
Updated Branches:
  refs/heads/SANDBOX-498-OPTIONS [created] 331f80bfc


SANDBOX-498 Add parser options and initialise regular expressions once


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/331f80bf
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/331f80bf
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/331f80bf

Branch: refs/heads/SANDBOX-498-OPTIONS
Commit: 331f80bfcf0380fcc35a6d18a327aef4a9e844e4
Parents: bf8bfb0
Author: Bruno P. Kinoshita <br...@yahoo.com.br>
Authored: Mon Apr 20 15:41:05 2015 +1200
Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
Committed: Mon Apr 20 15:41:09 2015 +1200

----------------------------------------------------------------------
 .../commons/text/names/HumanNameParser.java     | 73 ++++++++++++--------
 .../commons/text/names/ParserOptions.java       | 59 ++++++++++++++++
 2 files changed, 102 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/331f80bf/src/main/java/org/apache/commons/text/names/HumanNameParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/names/HumanNameParser.java b/src/main/java/org/apache/commons/text/names/HumanNameParser.java
index 5407d15..e7a3927 100644
--- a/src/main/java/org/apache/commons/text/names/HumanNameParser.java
+++ b/src/main/java/org/apache/commons/text/names/HumanNameParser.java
@@ -17,8 +17,6 @@
  */
 package org.apache.commons.text.names;
 
-import java.util.Arrays;
-import java.util.List;
 import java.util.Objects;
 
 import org.apache.commons.lang3.StringUtils;
@@ -100,22 +98,51 @@ import org.apache.commons.lang3.StringUtils;
  */
 public final class HumanNameParser {
 
-    private final List<String> suffixes;
-    private final List<String> prefixes;
+    /**
+     * The options used by the parser.
+     */
+    private final ParserOptions options;
+
+    /*
+     * Regular expressions used by the parser.
+     */
 
+    // The regex use is a bit tricky.  *Everything* matched by the regex will be replaced,
+    // but you can select a particular parenthesized submatch to be returned.
+    // Also, note that each regex requires that the preceding ones have been run and their matches chopped out.
+    // Names that start or end with an apostrophe break this.
+    private final static String NICKNAMES_REGEX = "(?i) ('|\\\"|\\(\\\"*'*)(.+?)('|\\\"|\\\"*'*\\)) ";
+    // note the lookahead, which isn't returned or replaced
+    private final static String LEADING_INIT_REGEX = "(?i)(^(.\\.*)(?= \\p{L}{2}))";
+    private final static String FIRST_NAME_REGEX = "(?i)^([^ ]+)";
+    private final String suffixRegex;
+    private final String lastRegex;
+    
     /**
      * Creates a new parser.
      */
     public HumanNameParser() {
-        // TODO make this configurable
-        this.suffixes = Arrays.asList(
-                "esq", "esquire", "jr",
-                "sr", "2", "ii", "iii", "iv");
-        this.prefixes = Arrays.asList(
-                    "bar", "ben", "bin", "da", "dal",
-                    "de la", "de", "del", "der", "di", "ibn", "la", "le",
-                    "san", "st", "ste", "van", "van der", "van den", "vel",
-                    "von" );
+        this(ParserOptions.DEFAULT_OPTIONS);
+    }
+
+    /**
+     * Creates a new parser that uses the given options.
+     */
+    public HumanNameParser(ParserOptions options) {
+        this.options = options;
+        final String suffixes = StringUtils.join(options.getSuffixes(), "\\.*|") + "\\.*";
+        final String prefixes = StringUtils.join(options.getPrefixes(), " |") + " ";
+        suffixRegex = "(?i),* *((" + suffixes + ")$)";
+        lastRegex = "(?i)(?!^)\\b([^ ]+ y |" + prefixes + ")*[^ ]+$";
+    }
+
+    /**
+     * Gets the parser options.
+     *
+     * @return parser options
+     */
+    public ParserOptions getOptions() {
+        return options;
     }
 
     /**
@@ -129,23 +156,9 @@ public final class HumanNameParser {
         Objects.requireNonNull(name, "Parameter 'name' must not be null.");
 
         NameString nameString = new NameString(name);
-        // TODO compile regexes only once when the parser is created
-        String suffixes = StringUtils.join(this.suffixes, "\\.*|") + "\\.*";
-        String prefixes = StringUtils.join(this.prefixes, " |") + " ";
-
-        // The regex use is a bit tricky.  *Everything* matched by the regex will be replaced,
-        // but you can select a particular parenthesized submatch to be returned.
-        // Also, note that each regex requres that the preceding ones have been run, and matches chopped out.
-        // names that starts or end w/ an apostrophe break this
-        String nicknamesRegex = "(?i) ('|\\\"|\\(\\\"*'*)(.+?)('|\\\"|\\\"*'*\\)) ";
-        String suffixRegex = "(?i),* *((" + suffixes + ")$)";
-        String lastRegex = "(?i)(?!^)\\b([^ ]+ y |" + prefixes + ")*[^ ]+$";
-        // note the lookahead, which isn't returned or replaced
-        String leadingInitRegex = "(?i)(^(.\\.*)(?= \\p{L}{2}))";
-        String firstRegex = "(?i)^([^ ]+)";
 
         // get nickname, if there is one
-        String nickname = nameString.chopWithRegex(nicknamesRegex, 2);
+        String nickname = nameString.chopWithRegex(NICKNAMES_REGEX, 2);
 
         // get suffix, if there is one
         String suffix = nameString.chopWithRegex(suffixRegex, 1);
@@ -157,10 +170,10 @@ public final class HumanNameParser {
         String last = nameString.chopWithRegex(lastRegex, 0);
 
         // get the first initial, if there is one
-        String leadingInit = nameString.chopWithRegex(leadingInitRegex, 1);
+        String leadingInit = nameString.chopWithRegex(LEADING_INIT_REGEX, 1);
 
         // get the first name
-        String first = nameString.chopWithRegex(firstRegex, 0);
+        String first = nameString.chopWithRegex(FIRST_NAME_REGEX, 0);
         if (StringUtils.isBlank(first)) {
             throw new NameParseException("Couldn't find a first name in '{" + nameString.getWrappedString() + "}'");
         }

http://git-wip-us.apache.org/repos/asf/commons-text/blob/331f80bf/src/main/java/org/apache/commons/text/names/ParserOptions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/names/ParserOptions.java b/src/main/java/org/apache/commons/text/names/ParserOptions.java
new file mode 100644
index 0000000..6bca771
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/names/ParserOptions.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.names;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Options for the {@link HumanNameParser} parser.
+ */
+public final class ParserOptions {
+
+    public static final ParserOptions DEFAULT_OPTIONS = new ParserOptions();
+
+    private final Set<String> suffixes;
+
+    private final Set<String> prefixes;
+
+    public ParserOptions() {
+        this.suffixes = new HashSet<String>(Arrays.asList(
+                "esq", "esquire", "jr",
+                "sr", "2", "ii", "iii", "iv"));
+        this.prefixes = new HashSet<String>(Arrays.asList(
+                "bar", "ben", "bin", "da", "dal",
+                "de la", "de", "del", "der", "di", "ibn", "la", "le",
+                "san", "st", "ste", "van", "van der", "van den", "vel",
+                "von"));
+    }
+
+    /**
+     * @return the suffixes
+     */
+    public Set<String> getSuffixes() {
+        return suffixes;
+    }
+
+    /**
+     * @return the prefixes
+     */
+    public Set<String> getPrefixes() {
+        return prefixes;
+    }
+
+}