You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mh...@apache.org on 2015/08/27 15:11:06 UTC
svn commit: r1698145 - in /lucene/dev/trunk/lucene: ./
analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/
analysis/common/src/resources/META-INF/services/
analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/
Author: mharwood
Date: Thu Aug 27 13:11:06 2015
New Revision: 1698145
URL: http://svn.apache.org/r1698145
Log:
LUCENE-6747: FingerprintFilter is a new TokenFilter that outputs a single token which is a concatenation of the sorted and de-duplicated set of input tokens.
Added:
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java (with props)
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilterFactory.java (with props)
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java (with props)
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilterFactory.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1698145&r1=1698144&r2=1698145&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Thu Aug 27 13:11:06 2015
@@ -14,6 +14,10 @@ System Requirements
all other modules with "compact2". (Robert Muir, Uwe Schindler)
New Features
+* LUCENE-6747: FingerprintFilter is a TokenFilter that outputs a single
+ token which is a concatenation of the sorted and de-duplicated set of
+ input tokens. Useful for normalizing short text in clustering/linking
+ tasks. (Mark Harwood, Adrien Grand)
* LUCENE-5735: NumberRangePrefixTreeStrategy now includes interval/range faceting
for counting ranges that align with the underlying terms as defined by the
Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java?rev=1698145&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java Thu Aug 27 13:11:06 2015
@@ -0,0 +1,217 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Filter outputs a single token which is a concatenation of the sorted and
+ * de-duplicated set of input tokens. This can be useful for clustering/linking
+ * use cases.
+ * <p>
+ * The first call to {@link #incrementToken()} drains the wrapped stream, so at
+ * most one token is produced between two calls to {@link #reset()}. No token
+ * is produced when the input is empty or when the combined length of the
+ * unique terms plus separators exceeds the configured maximum.
+ */
+public class FingerprintFilter extends TokenFilter {
+
+  public static final int DEFAULT_MAX_OUTPUT_TOKEN_SIZE = 1024;
+  public static final char DEFAULT_SEPARATOR = ' ';
+
+  /**
+   * Orders terms (held as {@code char[]}) by comparing their chars position by
+   * position; when one term is a prefix of the other, the shorter one sorts
+   * first. The comparator keeps no state, so one shared instance is enough.
+   */
+  private static final Comparator<Object> TERM_ORDER = new Comparator<Object>() {
+    @Override
+    public int compare(Object o1, Object o2) {
+      char[] v1 = (char[]) o1;
+      char[] v2 = (char[]) o2;
+      int len1 = v1.length;
+      int len2 = v2.length;
+      int lim = Math.min(len1, len2);
+
+      for (int k = 0; k < lim; k++) {
+        char c1 = v1[k];
+        char c2 = v2[k];
+        if (c1 != c2) {
+          // chars widen to non-negative ints, so this subtraction cannot overflow
+          return c1 - c2;
+        }
+      }
+      // array lengths are non-negative, so this subtraction cannot overflow either
+      return len1 - len2;
+    }
+  };
+
+  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+  /** Terms seen so far; non-null once the input has been gathered (see {@link #incrementToken()}). */
+  private CharArraySet uniqueTerms = null;
+  private final int maxOutputTokenSize;
+  /** Attribute state captured right after the output token was built; restored by {@link #end()}. */
+  private AttributeSource.State finalState;
+
+  private final char separator;
+  /** True once {@code input.end()} has been called for the current use of this filter. */
+  private boolean inputEnded = false;
+
+
+  /**
+   * Create a new FingerprintFilter with default settings
+   */
+  public FingerprintFilter(TokenStream input) {
+    this(input, DEFAULT_MAX_OUTPUT_TOKEN_SIZE, DEFAULT_SEPARATOR);
+  }
+
+  /**
+   * Create a new FingerprintFilter with control over all settings
+   *
+   * @param input
+   *          the source of tokens to be summarized into a single token
+   * @param maxOutputTokenSize
+   *          the maximum length of the summarized output token. If exceeded, no
+   *          output token is emitted
+   * @param separator
+   *          the character used to separate tokens combined into the single
+   *          output token
+   */
+  public FingerprintFilter(TokenStream input, int maxOutputTokenSize,
+      char separator) {
+    super(input);
+    this.maxOutputTokenSize = maxOutputTokenSize;
+    this.separator = separator;
+  }
+
+  /**
+   * Emits the fingerprint token on the first call after {@link #reset()} and
+   * reports end of stream on every later call.
+   *
+   * @return true if the fingerprint token was produced by this call
+   */
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (uniqueTerms != null) {
+      // We have already built the single output token - there's no more
+      return false;
+    }
+    boolean result = buildSingleOutputToken();
+    finalState = captureState();
+    return result;
+  }
+
+  /**
+   * Gathers all tokens from input, de-duplicates, sorts then concatenates.
+   *
+   * @return false for end of stream; true otherwise
+   */
+  private boolean buildSingleOutputToken() throws IOException {
+    inputEnded = false;
+
+    char[] clonedLastTerm = null;
+    uniqueTerms = new CharArraySet(8, false);
+    // Length of the eventual output: unique term lengths plus one per separator
+    int outputTokenSize = 0;
+    while (input.incrementToken()) {
+      if (outputTokenSize > maxOutputTokenSize) {
+        // Already over the limit: keep draining the input, but stop recording terms
+        continue;
+      }
+
+      final char[] term = termAttribute.buffer();
+      final int length = termAttribute.length();
+
+      if (!uniqueTerms.contains(term, 0, length)) {
+        // clone the term, and add to the set of seen terms.
+        clonedLastTerm = new char[length];
+        System.arraycopy(term, 0, clonedLastTerm, 0, length);
+        if (uniqueTerms.size() > 0) {
+          outputTokenSize++; //Add 1 for the separator char we will output
+        }
+        uniqueTerms.add(clonedLastTerm);
+        outputTokenSize += length;
+      }
+    }
+    //Force end-of-stream operations to get the final state.
+    input.end();
+    inputEnded = true;
+
+    //Gathering complete - now output exactly zero or one token:
+
+    //Set the attributes for the single output token
+    offsetAtt.setOffset(0, offsetAtt.endOffset());
+    posLenAtt.setPositionLength(1);
+    posIncrAtt.setPositionIncrement(1);
+    typeAtt.setType("fingerprint");
+
+    //No tokens gathered - no output
+    if (uniqueTerms.size() < 1) {
+      termAttribute.setEmpty();
+      return false;
+    }
+
+    //Tokens gathered are too large - no output
+    if (outputTokenSize > maxOutputTokenSize) {
+      termAttribute.setEmpty();
+      uniqueTerms.clear();
+      return false;
+    }
+
+    // Special case - faster option when we have a single token
+    if (uniqueTerms.size() == 1) {
+      // The only unique term is the last one cloned; copy its chars directly
+      // instead of going through an intermediate String
+      termAttribute.copyBuffer(clonedLastTerm, 0, clonedLastTerm.length);
+      uniqueTerms.clear();
+      return true;
+    }
+
+    // Sort the set of deduplicated tokens and combine
+    Object[] items = uniqueTerms.toArray();
+    Arrays.sort(items, TERM_ORDER);
+
+    // outputTokenSize is the exact length of the joined terms, so presize the builder
+    StringBuilder sb = new StringBuilder(outputTokenSize);
+    for (Object item : items) {
+      if (sb.length() >= 1) {
+        sb.append(separator);
+      }
+      sb.append((char[]) item);
+    }
+    termAttribute.setEmpty().append(sb);
+    uniqueTerms.clear();
+    return true;
+
+  }
+
+  /**
+   * Ends the wrapped stream if that has not happened yet, then restores the
+   * attribute state captured by {@link #incrementToken()}, if any.
+   */
+  @Override
+  public final void end() throws IOException {
+    if (!inputEnded) {
+      // Rare case - If an IOException occurs while performing buildSingleOutputToken
+      // we may not have called input.end() already
+      input.end();
+      inputEnded = true;
+    }
+
+    if (finalState != null) {
+      restoreState(finalState);
+    }
+  }
+
+  /**
+   * Resets the wrapped stream and forgets everything gathered during the
+   * previous use, so that the next {@link #incrementToken()} call builds a
+   * fresh fingerprint.
+   */
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    inputEnded = false;
+    uniqueTerms = null;
+    // Drop the state captured during the previous use; otherwise end() could
+    // restore stale attribute values if this use fails before capturing its own
+    finalState = null;
+  }
+
+}
Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilterFactory.java?rev=1698145&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilterFactory.java Thu Aug 27 13:11:06 2015
@@ -0,0 +1,59 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link FingerprintFilter}.
+ * <p>
+ * Both settings are optional:
+ * <ul>
+ *   <li>{@code maxOutputTokenSize} defaults to
+ *       {@link FingerprintFilter#DEFAULT_MAX_OUTPUT_TOKEN_SIZE}.</li>
+ *   <li>{@code separator} defaults to
+ *       {@link FingerprintFilter#DEFAULT_SEPARATOR}.</li>
+ * </ul>
+ * See {@link FingerprintFilter} for an explanation of how they are used.
+ */
+public class FingerprintFilterFactory extends TokenFilterFactory {
+
+  public static final String MAX_OUTPUT_TOKEN_SIZE_KEY = "maxOutputTokenSize";
+  public static final String SEPARATOR_KEY = "separator";
+
+  final int maxOutputTokenSize;
+  final char separator;
+
+  /**
+   * Creates a new FingerprintFilterFactory
+   *
+   * @param args
+   *          configuration entries, looked up under
+   *          {@link #MAX_OUTPUT_TOKEN_SIZE_KEY} and {@link #SEPARATOR_KEY}
+   * @throws IllegalArgumentException
+   *           if {@code args} is not empty once both settings have been read
+   */
+  public FingerprintFilterFactory(Map<String, String> args) {
+    super(args);
+    this.maxOutputTokenSize =
+        getInt(args, MAX_OUTPUT_TOKEN_SIZE_KEY, FingerprintFilter.DEFAULT_MAX_OUTPUT_TOKEN_SIZE);
+    this.separator =
+        getChar(args, SEPARATOR_KEY, FingerprintFilter.DEFAULT_SEPARATOR);
+
+    // Whatever remains in args at this point is treated as an unknown parameter
+    final boolean hasLeftovers = !args.isEmpty();
+    if (hasLeftovers) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Wraps {@code input} in a {@link FingerprintFilter} configured with this factory's settings. */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new FingerprintFilter(input, maxOutputTokenSize, separator);
+  }
+
+}
Modified: lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory?rev=1698145&r1=1698144&r2=1698145&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory Thu Aug 27 13:11:06 2015
@@ -61,6 +61,7 @@ org.apache.lucene.analysis.lv.LatvianSte
org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
+org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java?rev=1698145&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java Thu Aug 27 13:11:06 2015
@@ -0,0 +1,72 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+public class TestFingerprintFilter extends BaseTokenStreamTestCase {
+
+  /** Each scenario runs twice: with the mock tokenizer's checks enabled, then disabled. */
+  private static final boolean[] CHECK_SETTINGS = { true, false };
+
+  /** Repeated, unordered terms come out as one sorted token without duplicates. */
+  public void testDupsAndSorting() throws Exception {
+    for (boolean enableChecks : CHECK_SETTINGS) {
+      MockTokenizer source = whitespaceMockTokenizer("B A B E");
+      source.setEnableChecks(enableChecks);
+      assertTokenStreamContents(new FingerprintFilter(source), new String[] { "A B E" });
+    }
+  }
+
+  /** Input made only of one repeated term yields that term once. */
+  public void testAllDupValues() throws Exception {
+    for (boolean enableChecks : CHECK_SETTINGS) {
+      MockTokenizer source = whitespaceMockTokenizer("B2 B2");
+      source.setEnableChecks(enableChecks);
+      assertTokenStreamContents(new FingerprintFilter(source), new String[] { "B2" });
+    }
+  }
+
+  /** A fingerprint longer than the configured maximum produces no token at all. */
+  public void testMaxFingerprintSize() throws Exception {
+    for (boolean enableChecks : CHECK_SETTINGS) {
+      MockTokenizer source = whitespaceMockTokenizer("B2 A1 C3 D4 E5 F6 G7 H1");
+      source.setEnableChecks(enableChecks);
+      assertTokenStreamContents(new FingerprintFilter(source, 4, ' '), new String[0]);
+    }
+  }
+
+  /** The configured separator is placed between the sorted terms. */
+  public void testCustomSeparator() throws Exception {
+    for (boolean enableChecks : CHECK_SETTINGS) {
+      MockTokenizer source = whitespaceMockTokenizer("B2 A1 C3 B2");
+      source.setEnableChecks(enableChecks);
+      TokenStream fingerprint = new FingerprintFilter(
+          source, FingerprintFilter.DEFAULT_MAX_OUTPUT_TOKEN_SIZE, '_');
+      assertTokenStreamContents(fingerprint, new String[] { "A1_B2_C3" });
+    }
+  }
+
+  /** A lone input term is passed through unchanged. */
+  public void testSingleToken() throws Exception {
+    for (boolean enableChecks : CHECK_SETTINGS) {
+      MockTokenizer source = whitespaceMockTokenizer("A1");
+      source.setEnableChecks(enableChecks);
+      assertTokenStreamContents(new FingerprintFilter(source), new String[] { "A1" });
+    }
+  }
+
+}
Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilterFactory.java?rev=1698145&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilterFactory.java Thu Aug 27 13:11:06 2015
@@ -0,0 +1,62 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+public class TestFingerprintFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+  /**
+   * Builds the filter through the factory with both settings given and checks
+   * the resulting fingerprint, once with tokenizer checks enabled and once
+   * with them disabled.
+   */
+  public void test() throws Exception {
+    for (boolean enableChecks : new boolean[] { true, false }) {
+      Reader text = new StringReader("A1 B2 A1 D4 C3");
+      MockTokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+      source.setReader(text);
+      source.setEnableChecks(enableChecks);
+
+      TokenStream fingerprint = tokenFilterFactory("Fingerprint",
+          FingerprintFilterFactory.MAX_OUTPUT_TOKEN_SIZE_KEY, "256",
+          FingerprintFilterFactory.SEPARATOR_KEY, "_"
+      ).create(source);
+
+      assertTokenStreamContents(fingerprint, new String[] { "A1_B2_C3_D4" });
+    }
+  }
+
+  /** The factory can be created without any parameters. */
+  public void testRequired() throws Exception {
+    // no params are required
+    tokenFilterFactory("Fingerprint");
+  }
+
+  /**
+   * Test that bogus arguments result in exception
+   */
+  public void testBogusArguments() throws Exception {
+    try {
+      tokenFilterFactory("Fingerprint",
+          FingerprintFilterFactory.MAX_OUTPUT_TOKEN_SIZE_KEY, "3",
+          "bogusArg", "bogusValue");
+      fail();
+    } catch (IllegalArgumentException e) {
+      final String message = e.getMessage();
+      assertTrue(message.contains("Unknown parameters"));
+    }
+  }
+}