You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/20 01:43:49 UTC
svn commit: r1579491 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/ lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/
lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/
Author: rmuir
Date: Thu Mar 20 00:43:48 2014
New Revision: 1579491
URL: http://svn.apache.org/r1579491
Log:
LUCENE-4072: add ICUNormalizer2CharFilter
Added:
lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
- copied unchanged from r1579488, lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
- copied, changed from r1579488, lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1579491&r1=1579490&r2=1579491&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Thu Mar 20 00:43:48 2014
@@ -59,6 +59,9 @@ New Features
you update the value of a BinaryDocValuesField without reindexing the
document(s). (Shai Erera)
+* LUCENE-4072: Add ICUNormalizer2CharFilter, which lets you do unicode normalization
+ with offset correction before the tokenizer. (David Goldfarb, Ippei UKAI via Robert Muir)
+
API Changes
* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
Copied: lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java (from r1579488, lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java?p2=lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java&p1=lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java&r1=1579488&r2=1579491&rev=1579491&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java Thu Mar 20 00:43:48 2014
@@ -60,8 +60,7 @@ public class TestICUNormalizer2CharFilte
CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));
- Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- tokenStream.setReader(reader);
+ Tokenizer tokenStream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(tokenStream,
new String[] {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"},
@@ -77,8 +76,7 @@ public class TestICUNormalizer2CharFilte
CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
- Tokenizer tokenStream = new NGramTokenizer(TEST_VERSION_CURRENT, 1, 1);
- tokenStream.setReader(reader);
+ Tokenizer tokenStream = new NGramTokenizer(TEST_VERSION_CURRENT, reader, 1, 1);
assertTokenStreamContents(tokenStream,
new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
@@ -94,8 +92,7 @@ public class TestICUNormalizer2CharFilte
CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
- Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- tokenStream.setReader(reader);
+ Tokenizer tokenStream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(tokenStream,
new String[] {"صلى", "الله", "عليه", "وسلم"},
@@ -108,8 +105,8 @@ public class TestICUNormalizer2CharFilte
public void doTestMode(final Normalizer2 normalizer, int maxLength, int iterations) throws IOException {
Analyzer a = new Analyzer() {
@Override
- protected TokenStreamComponents createComponents(String fieldName) {
- return new TokenStreamComponents(new MockTokenizer(MockTokenizer.KEYWORD, false));
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new MockTokenizer(reader, MockTokenizer.KEYWORD, false));
}
@Override
@@ -175,8 +172,8 @@ public class TestICUNormalizer2CharFilte
// nfkc_cf
Analyzer a = new Analyzer() {
@Override
- protected TokenStreamComponents createComponents(String fieldName) {
- return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
}
@Override
@@ -191,8 +188,8 @@ public class TestICUNormalizer2CharFilte
// nfkd
a = new Analyzer() {
@Override
- protected TokenStreamComponents createComponents(String fieldName) {
- return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
}
@Override
@@ -209,8 +206,8 @@ public class TestICUNormalizer2CharFilte
String text = "\udb40\udc3d\uf273\ue960\u06c8\ud955\udc13\ub7fc\u0692 \u2089\u207b\u2073\u2075";
Analyzer a = new Analyzer() {
@Override
- protected TokenStreamComponents createComponents(String fieldName) {
- return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
}
@Override