You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2017/07/06 23:03:19 UTC
[3/3] lucene-solr:branch_7_0: LUCENE-7773: Remove unused/deprecated token types from StandardTokenizer
LUCENE-7773: Remove unused/deprecated token types from StandardTokenizer
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/f7ce9d67
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/f7ce9d67
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/f7ce9d67
Branch: refs/heads/branch_7_0
Commit: f7ce9d679cb0cb25d2fbe2530b10c076ca8ebc68
Parents: c976664
Author: Steve Rowe <sa...@apache.org>
Authored: Thu Jul 6 19:02:11 2017 -0400
Committer: Steve Rowe <sa...@apache.org>
Committed: Thu Jul 6 19:02:46 2017 -0400
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +
.../lucene/analysis/charfilter/htmlentity.py | 66 ++++++++++----------
.../analysis/standard/ClassicTokenizerImpl.java | 23 +++----
.../standard/ClassicTokenizerImpl.jflex | 23 ++++---
.../lucene/analysis/core/TestAnalyzers.java | 11 ++--
.../analysis/standard/StandardTokenizer.java | 47 +++-----------
.../standard/StandardTokenizerImpl.java | 3 +-
.../standard/StandardTokenizerImpl.jflex | 2 +-
8 files changed, 75 insertions(+), 103 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f7ce9d67/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 1097010..7ea2aa2 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -179,6 +179,9 @@ Other
* LUCENE-5822: Convert README to Markdown (Jason Gerlowski via Mike Drob)
+* LUCENE-7773: Remove unused/deprecated token types from StandardTokenizer.
+ (Ahmet Arslan via Steve Rowe)
+
======================= Lucene 6.7.0 =======================
New Features
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f7ce9d67/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
index 94de1ac..3f28e82 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
@@ -19,7 +19,7 @@ import re
# for inclusion in HTMLStripCharFilter.jflex.
def main():
- print get_apache_license()
+ print(get_apache_license())
codes = {}
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
for line in get_entity_text().split('\n'):
@@ -38,51 +38,51 @@ def main():
new_entry = ('"%s"' if first_entry else ' | "%s"') % key
first_entry = False
if len(output_line) + len(new_entry) >= 80:
- print output_line
+ print(output_line)
output_line = ' '
output_line += new_entry
if key in ('quot','copy','gt','lt','reg','amp'):
new_entry = ' | "%s"' % key.upper()
if len(output_line) + len(new_entry) >= 80:
- print output_line
+ print(output_line)
output_line = ' '
output_line += new_entry
- print output_line, ')'
-
- print '%{'
- print ' private static final Map<String,String> upperCaseVariantsAccepted'
- print ' = new HashMap<>();'
- print ' static {'
- print ' upperCaseVariantsAccepted.put("quot", "QUOT");'
- print ' upperCaseVariantsAccepted.put("copy", "COPY");'
- print ' upperCaseVariantsAccepted.put("gt", "GT");'
- print ' upperCaseVariantsAccepted.put("lt", "LT");'
- print ' upperCaseVariantsAccepted.put("reg", "REG");'
- print ' upperCaseVariantsAccepted.put("amp", "AMP");'
- print ' }'
- print ' private static final CharArrayMap<Character> entityValues'
- print ' = new CharArrayMap<>(%i, false);' % len(keys)
- print ' static {'
- print ' String[] entities = {'
+ print(output_line, ')')
+
+ print('%{')
+ print(' private static final Map<String,String> upperCaseVariantsAccepted')
+ print(' = new HashMap<>();')
+ print(' static {')
+ print(' upperCaseVariantsAccepted.put("quot", "QUOT");')
+ print(' upperCaseVariantsAccepted.put("copy", "COPY");')
+ print(' upperCaseVariantsAccepted.put("gt", "GT");')
+ print(' upperCaseVariantsAccepted.put("lt", "LT");')
+ print(' upperCaseVariantsAccepted.put("reg", "REG");')
+ print(' upperCaseVariantsAccepted.put("amp", "AMP");')
+ print(' }')
+ print(' private static final CharArrayMap<Character> entityValues')
+ print(' = new CharArrayMap<>(%i, false);' % len(keys))
+ print(' static {')
+ print(' String[] entities = {')
output_line = ' '
for key in keys:
new_entry = ' "%s", "%s",' % (key, codes[key])
if len(output_line) + len(new_entry) >= 80:
- print output_line
+ print(output_line)
output_line = ' '
output_line += new_entry
- print output_line[:-1]
- print ' };'
- print ' for (int i = 0 ; i < entities.length ; i += 2) {'
- print ' Character value = entities[i + 1].charAt(0);'
- print ' entityValues.put(entities[i], value);'
- print ' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);'
- print ' if (upperCaseVariant != null) {'
- print ' entityValues.put(upperCaseVariant, value);'
- print ' }'
- print ' }'
- print " }"
- print "%}"
+ print(output_line[:-1])
+ print(' };')
+ print(' for (int i = 0 ; i < entities.length ; i += 2) {')
+ print(' Character value = entities[i + 1].charAt(0);')
+ print(' entityValues.put(entities[i], value);')
+ print(' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
+ print(' if (upperCaseVariant != null) {')
+ print(' entityValues.put(upperCaseVariant, value);')
+ print(' }')
+ print(' }')
+ print(" }")
+ print("%}")
def get_entity_text():
# The text below is taken verbatim from
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f7ce9d67/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
index 7311459..7e5105d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
@@ -16,6 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package org.apache.lucene.analysis.standard;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -342,17 +343,17 @@ class ClassicTokenizerImpl {
/* user code: */
-public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
-public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
-public static final int ACRONYM = StandardTokenizer.ACRONYM;
-public static final int COMPANY = StandardTokenizer.COMPANY;
-public static final int EMAIL = StandardTokenizer.EMAIL;
-public static final int HOST = StandardTokenizer.HOST;
-public static final int NUM = StandardTokenizer.NUM;
-public static final int CJ = StandardTokenizer.CJ;
-public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
-
-public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
+public static final int ALPHANUM = ClassicTokenizer.ALPHANUM;
+public static final int APOSTROPHE = ClassicTokenizer.APOSTROPHE;
+public static final int ACRONYM = ClassicTokenizer.ACRONYM;
+public static final int COMPANY = ClassicTokenizer.COMPANY;
+public static final int EMAIL = ClassicTokenizer.EMAIL;
+public static final int HOST = ClassicTokenizer.HOST;
+public static final int NUM = ClassicTokenizer.NUM;
+public static final int CJ = ClassicTokenizer.CJ;
+public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP;
+
+public static final String [] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES;
public final int yychar()
{
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f7ce9d67/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
index 4d6ad16..07d7857 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
@@ -17,7 +17,6 @@
package org.apache.lucene.analysis.standard;
-import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
@@ -36,17 +35,17 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%{
-public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
-public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
-public static final int ACRONYM = StandardTokenizer.ACRONYM;
-public static final int COMPANY = StandardTokenizer.COMPANY;
-public static final int EMAIL = StandardTokenizer.EMAIL;
-public static final int HOST = StandardTokenizer.HOST;
-public static final int NUM = StandardTokenizer.NUM;
-public static final int CJ = StandardTokenizer.CJ;
-public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
-
-public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
+public static final int ALPHANUM = ClassicTokenizer.ALPHANUM;
+public static final int APOSTROPHE = ClassicTokenizer.APOSTROPHE;
+public static final int ACRONYM = ClassicTokenizer.ACRONYM;
+public static final int COMPANY = ClassicTokenizer.COMPANY;
+public static final int EMAIL = ClassicTokenizer.EMAIL;
+public static final int HOST = ClassicTokenizer.HOST;
+public static final int NUM = ClassicTokenizer.NUM;
+public static final int CJ = ClassicTokenizer.CJ;
+public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP;
+
+public static final String [] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES;
public final int yychar()
{
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f7ce9d67/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
index 6d514d1..c9ae2e6 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
@@ -121,13 +121,12 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
@SuppressWarnings("unused")
public void _testStandardConstants() {
int x = StandardTokenizer.ALPHANUM;
- x = StandardTokenizer.APOSTROPHE;
- x = StandardTokenizer.ACRONYM;
- x = StandardTokenizer.COMPANY;
- x = StandardTokenizer.EMAIL;
- x = StandardTokenizer.HOST;
x = StandardTokenizer.NUM;
- x = StandardTokenizer.CJ;
+ x = StandardTokenizer.SOUTHEAST_ASIAN;
+ x = StandardTokenizer.IDEOGRAPHIC;
+ x = StandardTokenizer.HIRAGANA;
+ x = StandardTokenizer.KATAKANA;
+ x = StandardTokenizer.HANGUL;
String[] y = StandardTokenizer.TOKEN_TYPES;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f7ce9d67/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
index ed52f03..0410124 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -40,56 +40,25 @@ public final class StandardTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */
private StandardTokenizerImpl scanner;
- // TODO: how can we remove these old types?!
/** Alpha/numeric token type */
- public static final int ALPHANUM = 0;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int APOSTROPHE = 1;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int ACRONYM = 2;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int COMPANY = 3;
- /** Email token type */
- public static final int EMAIL = 4;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int HOST = 5;
+ public static final int ALPHANUM = 0;
/** Numeric token type */
- public static final int NUM = 6;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int CJ = 7;
-
- /** @deprecated (3.1) */
- @Deprecated
- public static final int ACRONYM_DEP = 8;
-
+ public static final int NUM = 1;
/** Southeast Asian token type */
- public static final int SOUTHEAST_ASIAN = 9;
- /** Idiographic token type */
- public static final int IDEOGRAPHIC = 10;
+ public static final int SOUTHEAST_ASIAN = 2;
+ /** Ideographic token type */
+ public static final int IDEOGRAPHIC = 3;
/** Hiragana token type */
- public static final int HIRAGANA = 11;
+ public static final int HIRAGANA = 4;
/** Katakana token type */
- public static final int KATAKANA = 12;
-
+ public static final int KATAKANA = 5;
/** Hangul token type */
- public static final int HANGUL = 13;
+ public static final int HANGUL = 6;
/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
"<ALPHANUM>",
- "<APOSTROPHE>",
- "<ACRONYM>",
- "<COMPANY>",
- "<EMAIL>",
- "<HOST>",
"<NUM>",
- "<CJ>",
- "<ACRONYM_DEP>",
"<SOUTHEAST_ASIAN>",
"<IDEOGRAPHIC>",
"<HIRAGANA>",
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f7ce9d67/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
index 5d7b240..8b288c2 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
@@ -16,6 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package org.apache.lucene.analysis.standard;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -435,7 +436,7 @@ public final class StandardTokenizerImpl {
*/
public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
- /** Idiographic token type */
+ /** Ideographic token type */
public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
/** Hiragana token type */
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f7ce9d67/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
index 11b2cbd..a1e7b17 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
@@ -82,7 +82,7 @@ ComplexContextEx = \p{LB:Complex_Context}
*/
public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
- /** Idiographic token type */
+ /** Ideographic token type */
public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
/** Hiragana token type */