You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2014/11/11 19:47:15 UTC
svn commit: r1638250 - in /lucene/dev/trunk: lucene/
lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/
lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/
lucene/licenses/ solr/licenses/
Author: mikemccand
Date: Tue Nov 11 18:47:14 2014
New Revision: 1638250
URL: http://svn.apache.org/r1638250
Log:
LUCENE-6059: add Daitch-Mokotoff Soundex phonetic Apache commons phonetic codec
Added:
lucene/dev/trunk/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilter.java (with props)
lucene/dev/trunk/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilterFactory.java (with props)
lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilter.java (with props)
lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilterFactory.java (with props)
lucene/dev/trunk/lucene/licenses/commons-codec-1.10.jar.sha1 (with props)
lucene/dev/trunk/solr/licenses/commons-codec-1.10.jar.sha1 (with props)
Removed:
lucene/dev/trunk/lucene/licenses/commons-codec-1.9.jar.sha1
lucene/dev/trunk/solr/licenses/commons-codec-1.9.jar.sha1
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java
lucene/dev/trunk/lucene/ivy-versions.properties
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1638250&r1=1638249&r2=1638250&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Nov 11 18:47:14 2014
@@ -82,6 +82,10 @@ New Features
* LUCENE-4400: Add support for new NYSIIS Apache commons phonetic
codec (Thomas Neidhart via Mike McCandless)
+* LUCENE-6059: Add Daitch-Mokotoff Soundex phonetic Apache commons
+ phonetic codec, and upgrade to Apache commons codec 1.10 (Thomas
+ Neidhart via Mike McCandless)
+
API Changes
* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and
Added: lucene/dev/trunk/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilter.java?rev=1638250&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilter.java Tue Nov 11 18:47:14 2014
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.phonetic;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.codec.language.DaitchMokotoffSoundex;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/**
+ * Create tokens for phonetic matches based on Daitch–Mokotoff Soundex.
+ *
+ * @lucene.experimental
+ */
+public final class DaitchMokotoffSoundexFilter extends TokenFilter {
+ /** true if encoded tokens should be added as synonyms */
+ protected boolean inject = true;
+ /** phonetic encoder */
+ protected DaitchMokotoffSoundex encoder = new DaitchMokotoffSoundex();
+
+ // output is a string such as ab|ac|...
+ private static final Pattern pattern = Pattern.compile("([^|]+)");
+
+ // matcher over any buffered output
+ private final Matcher matcher = pattern.matcher("");
+
+ // encoded representation
+ private String encoded;
+ // preserves all attributes for any buffered outputs
+ private State state;
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
+
+ /**
+ * Creates a DaitchMokotoffSoundexFilter that either adds the encoded forms as synonyms (
+ * <code>inject=true</code>) or replaces the original tokens with them (<code>inject=false</code>).
+ */
+ public DaitchMokotoffSoundexFilter(TokenStream in, boolean inject) {
+ super(in);
+ this.inject = inject;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (matcher.find()) {
+ assert state != null && encoded != null;
+ restoreState(state);
+ termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));
+ posAtt.setPositionIncrement(0);
+ return true;
+ }
+
+ if (input.incrementToken()) {
+ // pass through zero-length terms
+ if (termAtt.length() == 0) {
+ return true;
+ }
+
+ encoded = encoder.soundex(termAtt.toString());
+ state = captureState();
+ matcher.reset(encoded);
+
+ if (!inject) {
+ if (matcher.find()) {
+ termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ matcher.reset("");
+ state = null;
+ }
+}
Added: lucene/dev/trunk/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilterFactory.java?rev=1638250&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilterFactory.java Tue Nov 11 18:47:14 2014
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.phonetic;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link DaitchMokotoffSoundexFilter}.
+ *
+ * Create tokens based on Daitch–Mokotoff Soundex phonetic filter.
+ * <p>
+ * This takes one optional argument:
+ * <dl>
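+ * <dt>inject</dt><dd> (default=true) add the encoded tokens to the stream as synonyms of the original token (position increment=0); if false, the encoded tokens replace the original token</dd>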
+ * </dl>
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_phonetic" class="solr.TextField" positionIncrementGap="100"&gt;
+ * &lt;analyzer&gt;
+ * &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ * &lt;filter class="solr.DaitchMokotoffSoundexFilterFactory" inject="true"/&gt;
+ * &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ * @see DaitchMokotoffSoundexFilter
+ *
+ * @lucene.experimental
+ */
+public class DaitchMokotoffSoundexFilterFactory extends TokenFilterFactory {
+ /** parameter name: true if encoded tokens should be added as synonyms */
+ public static final String INJECT = "inject"; // boolean
+
+ final boolean inject; //accessed by the test
+
+ /** Creates a new DaitchMokotoffSoundexFilterFactory */
+ public DaitchMokotoffSoundexFilterFactory(Map<String,String> args) {
+ super(args);
+ inject = getBoolean(args, INJECT, true);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ @Override
+ public DaitchMokotoffSoundexFilter create(TokenStream input) {
+ return new DaitchMokotoffSoundexFilter(input, inject);
+ }
+
+}
Modified: lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java?rev=1638250&r1=1638249&r2=1638250&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java Tue Nov 11 18:47:14 2014
@@ -17,12 +17,10 @@ package org.apache.lucene.analysis.phone
* limitations under the License.
*/
-import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
/** Simple tests for {@link BeiderMorseFilterFactory} */
@@ -31,10 +29,10 @@ public class TestBeiderMorseFilterFactor
BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory(new HashMap<String,String>());
TokenStream ts = factory.create(whitespaceMockTokenizer("Weinberg"));
assertTokenStreamContents(ts,
- new String[] { "vDnbirk", "vanbirk", "vinbirk", "wDnbirk", "wanbirk", "winbirk" },
- new int[] { 0, 0, 0, 0, 0, 0 },
- new int[] { 8, 8, 8, 8, 8, 8 },
- new int[] { 1, 0, 0, 0, 0, 0 });
+ new String[] { "vDnbYrk", "vDnbirk", "vanbYrk", "vanbirk", "vinbYrk", "vinbirk", "wDnbirk", "wanbirk", "winbirk" },
+ new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ new int[] { 8, 8, 8, 8, 8, 8, 8, 8, 8},
+ new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0});
}
public void testLanguageSet() throws Exception {
Added: lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilter.java?rev=1638250&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilter.java Tue Nov 11 18:47:14 2014
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.phonetic;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/**
+ * Tests {@link DaitchMokotoffSoundexFilter}
+ */
+public class TestDaitchMokotoffSoundexFilter extends BaseTokenStreamTestCase {
+
+ public void testAlgorithms() throws Exception {
+ assertAlgorithm(true, "aaa bbb ccc easgasg",
+ new String[] { "aaa", "000000", "bbb", "700000", "ccc", "400000", "450000", "454000",
+ "540000", "545000", "500000", "easgasg", "045450" });
+ assertAlgorithm(false, "aaa bbb ccc easgasg",
+ new String[] { "000000", "700000", "400000", "450000", "454000", "540000", "545000",
+ "500000", "045450" });
+ }
+
+ static void assertAlgorithm(boolean inject, String input, String[] expected) throws Exception {
+ Tokenizer tokenizer = new WhitespaceTokenizer();
+ tokenizer.setReader(new StringReader(input));
+ DaitchMokotoffSoundexFilter filter = new DaitchMokotoffSoundexFilter(tokenizer, inject);
+ assertTokenStreamContents(filter, expected);
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws IOException {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, false));
+ }
+ };
+
+ checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
+
+ Analyzer b = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, false));
+ }
+ };
+
+ checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER);
+ }
+
+ public void testEmptyTerm() throws IOException {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new KeywordTokenizer();
+ return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, random().nextBoolean()));
+ }
+ };
+ checkOneTerm(a, "", "");
+ }
+
+}
Added: lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilterFactory.java?rev=1638250&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilterFactory.java Tue Nov 11 18:47:14 2014
@@ -0,0 +1,65 @@
+package org.apache.lucene.analysis.phonetic;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+public class TestDaitchMokotoffSoundexFilterFactory extends BaseTokenStreamTestCase {
+
+ public void testDefaults() throws Exception {
+ DaitchMokotoffSoundexFilterFactory factory = new DaitchMokotoffSoundexFilterFactory(new HashMap<String, String>());
+ Tokenizer inputStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ inputStream.setReader(new StringReader("international"));
+
+ TokenStream filteredStream = factory.create(inputStream);
+ assertEquals(DaitchMokotoffSoundexFilter.class, filteredStream.getClass());
+ assertTokenStreamContents(filteredStream, new String[] { "international", "063963" });
+ }
+
+ public void testSettingInject() throws Exception {
+ Map<String,String> parameters = new HashMap<>();
+ parameters.put("inject", "false");
+ DaitchMokotoffSoundexFilterFactory factory = new DaitchMokotoffSoundexFilterFactory(parameters);
+
+ Tokenizer inputStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ inputStream.setReader(new StringReader("international"));
+
+ TokenStream filteredStream = factory.create(inputStream);
+ assertEquals(DaitchMokotoffSoundexFilter.class, filteredStream.getClass());
+ assertTokenStreamContents(filteredStream, new String[] { "063963" });
+ }
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ new DaitchMokotoffSoundexFilterFactory(new HashMap<String,String>() {{
+ put("bogusArg", "bogusValue");
+ }});
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
+}
Modified: lucene/dev/trunk/lucene/ivy-versions.properties
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/ivy-versions.properties?rev=1638250&r1=1638249&r2=1638250&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/ivy-versions.properties (original)
+++ lucene/dev/trunk/lucene/ivy-versions.properties Tue Nov 11 18:47:14 2014
@@ -55,7 +55,7 @@ com.sun.jersey.version = 1.9
/com.uwyn/jhighlight = 1.0
/commons-beanutils/commons-beanutils = 1.8.3
/commons-cli/commons-cli = 1.2
-/commons-codec/commons-codec = 1.9
+/commons-codec/commons-codec = 1.10
/commons-collections/commons-collections = 3.2.1
/commons-configuration/commons-configuration = 1.6
/commons-digester/commons-digester = 2.1
Added: lucene/dev/trunk/lucene/licenses/commons-codec-1.10.jar.sha1
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/licenses/commons-codec-1.10.jar.sha1?rev=1638250&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/licenses/commons-codec-1.10.jar.sha1 (added)
+++ lucene/dev/trunk/lucene/licenses/commons-codec-1.10.jar.sha1 Tue Nov 11 18:47:14 2014
@@ -0,0 +1 @@
+4b95f4897fa13f2cd904aee711aeafc0c5295cd8
Added: lucene/dev/trunk/solr/licenses/commons-codec-1.10.jar.sha1
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/licenses/commons-codec-1.10.jar.sha1?rev=1638250&view=auto
==============================================================================
--- lucene/dev/trunk/solr/licenses/commons-codec-1.10.jar.sha1 (added)
+++ lucene/dev/trunk/solr/licenses/commons-codec-1.10.jar.sha1 Tue Nov 11 18:47:14 2014
@@ -0,0 +1 @@
+4b95f4897fa13f2cd904aee711aeafc0c5295cd8