You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ry...@apache.org on 2008/10/18 20:38:24 UTC
svn commit: r705903 - in /lucene/solr/trunk: ./ example/solr/conf/
src/java/org/apache/solr/analysis/ src/test/org/apache/solr/analysis/
Author: ryan
Date: Sat Oct 18 11:38:24 2008
New Revision: 705903
URL: http://svn.apache.org/viewvc?rev=705903&view=rev
Log:
SOLR-813: Adding DoubleMetaphone Filter and Factory
Added:
lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java
lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/example/solr/conf/schema.xml
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=705903&r1=705902&r2=705903&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Sat Oct 18 11:38:24 2008
@@ -43,7 +43,7 @@
See http://lucene.apache.org/java/2_3_2/api/org/apache/lucene/index/IndexDeletionPolicy.html
(yonik, Noble Paul, Akshay Ukey via shalin)
- 3. SOLR-657: Replace deprecated calls with the non-deprecated equivalents
+ 3. SOLR-657: Replace many deprecated calls with non-deprecated equivalents
(Lars Kotthoff via ryan)
4. SOLR-658: Allow Solr to load index from arbitrary directory in dataDir
@@ -56,6 +56,10 @@
6. SOLR-670: Add support for rollbacks in UpdateHandler. This allows user to rollback all changes
since the last commit. (Noble Paul, koji via shalin)
+ 7. SOLR-813: Adding DoubleMetaphone Filter and Factory. Similar to the PhoneticFilter,
+ but this uses DoubleMetaphone specific calls (including alternate encoding)
+ (Todd Feak via ryan)
+
Optimizations
----------------------
Modified: lucene/solr/trunk/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/example/solr/conf/schema.xml?rev=705903&r1=705902&r2=705903&view=diff
==============================================================================
--- lucene/solr/trunk/example/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/example/solr/conf/schema.xml Sat Oct 18 11:38:24 2008
@@ -247,6 +247,14 @@
/>
</analyzer>
</fieldType>
+
+ <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+ </analyzer>
+ </fieldtype>
+
<!-- since fields of this type are by default not stored or indexed, any data added to
them will be ignored outright
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java?rev=705903&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java Sat Oct 18 11:38:24 2008
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.LinkedList;
+
+import org.apache.commons.codec.language.DoubleMetaphone;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Token filter that emits Double Metaphone encodings of the terms read from the
+ * wrapped stream.
+ * <p>
+ * For every input token the primary encoding and, when it differs from the
+ * primary, the alternate encoding are emitted as tokens of type
+ * "DoubleMetaphone". When {@code inject} is true the original token is emitted
+ * first and the encodings follow with a position increment of 0. When
+ * {@code inject} is false only the encodings are emitted, and tokens that yield
+ * no encoding are dropped; the position increment of the next emitted token is
+ * enlarged so the dropped positions are not lost.
+ */
+public class DoubleMetaphoneFilter extends TokenFilter {
+
+  private static final String TOKEN_TYPE = "DoubleMetaphone";
+
+  /** Tokens already produced for the current source token but not yet returned. */
+  private final LinkedList<Token> remainingTokens = new LinkedList<Token>();
+  private final DoubleMetaphone encoder = new DoubleMetaphone();
+  private final boolean inject;
+
+  /**
+   * @param input         stream whose tokens are encoded
+   * @param maxCodeLength maximum length passed to the encoder
+   * @param inject        true to keep the original tokens in the output
+   */
+  protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
+    super(input);
+    this.encoder.setMaxCodeLen(maxCodeLength);
+    this.inject = inject;
+  }
+
+  /**
+   * Returns the next queued token, or reads from the wrapped stream until a
+   * source token produces output.
+   *
+   * @param in reusable token handed to the wrapped stream
+   * @return the next token, or null when the wrapped stream is exhausted
+   * @throws IOException if the wrapped stream fails
+   */
+  @Override
+  public final Token next(Token in) throws IOException {
+    if (!remainingTokens.isEmpty()) {
+      return remainingTokens.removeFirst();
+    }
+
+    // Positions of source tokens that were dropped because they had no encoding.
+    int skippedPositions = 0;
+
+    for (Token t = input.next(in); t != null; t = input.next(in)) {
+      if (inject) {
+        remainingTokens.addLast(t);
+      }
+
+      String v = new String(t.termBuffer(), 0, t.termLength());
+
+      // The encoder may return null (e.g. for a blank term), so guard before use.
+      String primaryPhoneticValue = encoder.doubleMetaphone(v);
+      if (primaryPhoneticValue != null && primaryPhoneticValue.length() > 0) {
+        queuePhoneticToken(t, primaryPhoneticValue);
+      }
+
+      String alternatePhoneticValue = encoder.doubleMetaphone(v, true);
+      if (alternatePhoneticValue != null
+          && alternatePhoneticValue.length() > 0
+          && !alternatePhoneticValue.equals(primaryPhoneticValue)) {
+        queuePhoneticToken(t, alternatePhoneticValue);
+      }
+
+      if (!remainingTokens.isEmpty()) {
+        Token first = remainingTokens.removeFirst();
+        // Only reachable with a non-zero count when inject is false: with inject
+        // the original token is always queued, so nothing is ever skipped and
+        // its position increment is left untouched.
+        if (skippedPositions > 0) {
+          first.setPositionIncrement(first.getPositionIncrement() + skippedPositions);
+        }
+        return first;
+      }
+
+      // Nothing was produced for this token: drop it but remember its position.
+      skippedPositions += t.getPositionIncrement();
+    }
+
+    return null;
+  }
+
+  /**
+   * Queues a clone of {@code source} carrying {@code code} as its term. The
+   * clone keeps the source's position increment when it is the first token
+   * queued for that source; otherwise it is stacked with an increment of 0.
+   */
+  private void queuePhoneticToken(Token source, String code) {
+    Token token = (Token) source.clone();
+    if (!remainingTokens.isEmpty()) {
+      token.setPositionIncrement(0);
+    }
+    token.setType(TOKEN_TYPE);
+    token.setTermBuffer(code);
+    remainingTokens.addLast(token);
+  }
+}
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java?rev=705903&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java Sat Oct 18 11:38:24 2008
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory for {@link DoubleMetaphoneFilter}.
+ * <p>
+ * Recognized init args:
+ * <ul>
+ *   <li>{@code inject} - "true" or "false"; defaults to true</li>
+ *   <li>{@code maxCodeLength} - integer; defaults to {@link #DEFAULT_MAX_CODE_LENGTH}</li>
+ * </ul>
+ */
+public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory
+{
+  /** Name of the init arg that sets the filter's inject flag. */
+  public static final String INJECT = "inject";
+  /** Name of the init arg that sets the maximum code length. */
+  public static final String MAX_CODE_LENGTH = "maxCodeLength";
+
+  public static final int DEFAULT_MAX_CODE_LENGTH = 4;
+
+  private boolean inject = true;
+  private int maxCodeLength = DEFAULT_MAX_CODE_LENGTH;
+
+  /**
+   * Reads the optional {@code inject} and {@code maxCodeLength} args; an absent
+   * arg leaves the corresponding default in place.
+   *
+   * @param args init args from the configuration
+   * @throws NumberFormatException if {@code maxCodeLength} is not an integer
+   */
+  @Override
+  public void init(Map<String, String> args) {
+    super.init(args);
+
+    String injectArg = args.get(INJECT);
+    if (injectArg != null) {
+      // parseBoolean interprets the configured text itself. Boolean.getBoolean
+      // would instead look up a JVM system property with that name, which made
+      // inject="true" evaluate to false.
+      inject = Boolean.parseBoolean(injectArg);
+    }
+
+    String maxCodeLengthArg = args.get(MAX_CODE_LENGTH);
+    if (maxCodeLengthArg != null) {
+      maxCodeLength = Integer.parseInt(maxCodeLengthArg);
+    }
+  }
+
+  /** Wraps {@code input} in a filter configured with the values read by init. */
+  public DoubleMetaphoneFilter create(TokenStream input) {
+    return new DoubleMetaphoneFilter(input, maxCodeLength, inject);
+  }
+}
Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java?rev=705903&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java Sat Oct 18 11:38:24 2008
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
+
+/** Checks the filters built by DoubleMetaphoneFilterFactory for default and explicit args. */
+public class DoubleMetaphoneFilterFactoryTest extends TestCase {
+
+  /** An empty arg map yields the original term followed by a 4-character code. */
+  public void testDefaults() throws Exception {
+    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
+    factory.init(new HashMap<String, String>());
+
+    TokenStream filteredStream = factory.create(new IterTokenStream("international"));
+    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
+
+    assertTerm("international", filteredStream.next(new Token()));
+    assertTerm("ANTR", filteredStream.next(new Token()));
+    assertNull(filteredStream.next(new Token()));
+  }
+
+  /** inject=false with maxCodeLength=8 yields only the 8-character code. */
+  public void testSettingSizeAndInject() throws Exception {
+    Map<String, String> parameters = new HashMap<String, String>();
+    parameters.put("inject", "false");
+    parameters.put("maxCodeLength", "8");
+
+    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
+    factory.init(parameters);
+
+    TokenStream filteredStream = factory.create(new IterTokenStream("international"));
+    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
+
+    assertTerm("ANTRNXNL", filteredStream.next(new Token()));
+    assertNull(filteredStream.next(new Token()));
+  }
+
+  /** Asserts that the token's term has the expected length and text. */
+  private static void assertTerm(String expected, Token actual) {
+    assertEquals(expected.length(), actual.termLength());
+    assertEquals(expected, new String(actual.termBuffer(), 0, actual.termLength()));
+  }
+}
Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java?rev=705903&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterTest.java Sat Oct 18 11:38:24 2008
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
+
+/** Exercises DoubleMetaphoneFilter with different code lengths and inject settings. */
+public class DoubleMetaphoneFilterTest extends TestCase {
+
+  public void testSize4FalseInject() throws Exception {
+    TokenStream filter =
+        new DoubleMetaphoneFilter(new IterTokenStream("international"), 4, false);
+
+    assertTerm("ANTR", filter.next(new Token()));
+    assertNull(filter.next(new Token()));
+  }
+
+  public void testSize4TrueInject() throws Exception {
+    TokenStream filter =
+        new DoubleMetaphoneFilter(new IterTokenStream("international"), 4, true);
+
+    assertTerm("international", filter.next(new Token()));
+    assertTerm("ANTR", filter.next(new Token()));
+    assertNull(filter.next(new Token()));
+  }
+
+  /** A term with distinct primary and alternate codes yields both, primary first. */
+  public void testAlternateInjectFalse() throws Exception {
+    TokenStream filter =
+        new DoubleMetaphoneFilter(new IterTokenStream("Kuczewski"), 4, false);
+
+    assertTerm("KSSK", filter.next(new Token()));
+    assertTerm("KXFS", filter.next(new Token()));
+    assertNull(filter.next(new Token()));
+  }
+
+  public void testSize8FalseInject() throws Exception {
+    TokenStream filter =
+        new DoubleMetaphoneFilter(new IterTokenStream("international"), 8, false);
+
+    assertTerm("ANTRNXNL", filter.next(new Token()));
+    assertNull(filter.next(new Token()));
+  }
+
+  /** With inject, terms that cannot be encoded still pass through unchanged. */
+  public void testNonConvertableStringsWithInject() throws Exception {
+    String[] terms = new String[] { "12345", "#$%@#^%&" };
+    TokenStream filter = new DoubleMetaphoneFilter(new IterTokenStream(terms), 8, true);
+
+    assertTerm("12345", filter.next(new Token()));
+    assertTerm("#$%@#^%&", filter.next(new Token()));
+  }
+
+  /** Without inject, terms that cannot be encoded produce no output. */
+  public void testNonConvertableStringsWithoutInject() throws Exception {
+    TokenStream onlyUnencodable = new IterTokenStream(
+        new String[] { "12345", "#$%@#^%&" });
+    assertNull(new DoubleMetaphoneFilter(onlyUnencodable, 8, false).next(new Token()));
+
+    // should have something after the stream
+    TokenStream withEncodable = new IterTokenStream(
+        new String[] { "12345", "#$%@#^%&", "hello" });
+    assertNotNull(new DoubleMetaphoneFilter(withEncodable, 8, false).next(new Token()));
+  }
+
+  /** Asserts that the token's term has the expected length and text. */
+  private static void assertTerm(String expected, Token actual) {
+    assertEquals(expected.length(), actual.termLength());
+    assertEquals(expected, new String(actual.termBuffer(), 0, actual.termLength()));
+  }
+
+}