Posted to commits@lucene.apache.org by mh...@apache.org on 2015/08/27 15:11:06 UTC

svn commit: r1698145 - in /lucene/dev/trunk/lucene: ./ analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ analysis/common/src/resources/META-INF/services/ analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/

Author: mharwood
Date: Thu Aug 27 13:11:06 2015
New Revision: 1698145

URL: http://svn.apache.org/r1698145
Log:
LUCENE-6747: FingerprintFilter is a new TokenFilter that outputs a single token which is a concatenation of the sorted and de-duplicated set of input tokens.
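
For context, a minimal sketch (not part of this revision) of driving the new filter
directly from a tokenizer and reading back the single fingerprint token it emits.
FingerprintFilter is the class added below; the WhitespaceTokenizer wiring and the
FingerprintDemo class name are illustrative only.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class FingerprintDemo {
  public static void main(String[] args) throws Exception {
    // Tokenize a short string on whitespace, then collapse it to one token.
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("B A B E"));
    try (TokenStream stream = new FingerprintFilter(tokenizer)) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(term.toString()); // prints "A B E": sorted and de-duplicated
      }
      stream.end();
    }
  }
}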

Added:
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java   (with props)
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilterFactory.java   (with props)
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java   (with props)
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilterFactory.java   (with props)
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1698145&r1=1698144&r2=1698145&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Thu Aug 27 13:11:06 2015
@@ -14,6 +14,10 @@ System Requirements
   all other modules with "compact2".  (Robert Muir, Uwe Schindler)
 
 New Features
+* LUCENE-6747: FingerprintFilter is a TokenFilter that outputs a single
+  token which is a concatenation of the sorted and de-duplicated set of 
+  input tokens. Useful for normalizing short text in clustering/linking 
+  tasks. (Mark Harwood, Adrien Grand)
 
 * LUCENE-5735: NumberRangePrefixTreeStrategy now includes interval/range faceting
   for counting ranges that align with the underlying terms as defined by the

Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java?rev=1698145&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java Thu Aug 27 13:11:06 2015
@@ -0,0 +1,217 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Filter outputs a single token which is a concatenation of the sorted and
+ * de-duplicated set of input tokens. This can be useful for clustering/linking
+ * use cases.
+ */
+public class FingerprintFilter extends TokenFilter {
+
+  public static final int DEFAULT_MAX_OUTPUT_TOKEN_SIZE = 1024;
+  public static final char DEFAULT_SEPARATOR = ' ';
+  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+  private CharArraySet uniqueTerms = null;
+  private final int maxOutputTokenSize;
+  private AttributeSource.State finalState;
+
+  private final char separator;
+  private boolean inputEnded = false;
+
+
+  /**
+   * Create a new FingerprintFilter with default settings
+   */
+  public FingerprintFilter(TokenStream input) {
+    this(input, DEFAULT_MAX_OUTPUT_TOKEN_SIZE, DEFAULT_SEPARATOR);
+  }
+
+  /**
+   * Create a new FingerprintFilter with control over all settings
+   * 
+   * @param input
+   *          the source of tokens to be summarized into a single token
+   * @param maxOutputTokenSize
+   *          the maximum length of the summarized output token. If exceeded, no
+   *          output token is emitted
+   * @param separator
+   *          the character used to separate tokens combined into the single
+   *          output token
+   */
+  public FingerprintFilter(TokenStream input, int maxOutputTokenSize,
+      char separator) {
+    super(input);
+    this.maxOutputTokenSize = maxOutputTokenSize;
+    this.separator = separator;
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (uniqueTerms != null) {
+      // We have already built the single output token - there are no more tokens to emit
+      return false;
+    }
+    boolean result = buildSingleOutputToken();
+    finalState = captureState();
+    return result;
+  }
+
+  /**
+   * Gathers all tokens from input, de-duplicates, sorts, then concatenates.
+   * 
+   * @return false for end of stream; true otherwise
+   */
+  private final boolean buildSingleOutputToken() throws IOException {
+    inputEnded = false;
+
+    char clonedLastTerm[] = null;
+    uniqueTerms = new CharArraySet(8, false);
+    int outputTokenSize = 0;
+    while (input.incrementToken()) {
+      if (outputTokenSize > maxOutputTokenSize) {
+        continue;
+      }
+
+      final char term[] = termAttribute.buffer();
+      final int length = termAttribute.length();
+
+      if (!uniqueTerms.contains(term, 0, length)) {
+        // clone the term, and add to the set of seen terms.
+        clonedLastTerm = new char[length];
+        System.arraycopy(term, 0, clonedLastTerm, 0, length);
+        if (uniqueTerms.size() > 0) {
+          outputTokenSize++; //Add 1 for the separator char we will output
+        }
+        uniqueTerms.add(clonedLastTerm);
+        outputTokenSize += length;
+      }
+    }
+    //Force end-of-stream operations to get the final state.
+    input.end();
+    inputEnded = true;
+
+    //Gathering complete - now output exactly zero or one token:
+
+    //Set the attributes for the single output token
+    offsetAtt.setOffset(0, offsetAtt.endOffset());
+    posLenAtt.setPositionLength(1);
+    posIncrAtt.setPositionIncrement(1);
+    typeAtt.setType("fingerprint");
+
+    //No tokens gathered - no output
+    if (uniqueTerms.size() < 1) {
+      termAttribute.setEmpty();
+      return false;
+    }
+
+    //Tokens gathered are too large - no output
+    if (outputTokenSize > maxOutputTokenSize) {
+      termAttribute.setEmpty();
+      uniqueTerms.clear();
+      return false;
+    }
+
+    // Special case - faster option when we have a single token
+    if (uniqueTerms.size() == 1) {
+      termAttribute.setEmpty().append(new String(clonedLastTerm));
+      uniqueTerms.clear();
+      return true;
+    }
+
+    // Sort the set of deduplicated tokens and combine 
+    Object[] items = uniqueTerms.toArray();
+
+    Arrays.sort(items, new Comparator<Object>() {
+      @Override
+      public int compare(Object o1, Object o2) {
+        char v1[] = (char[]) o1;
+        char v2[] = (char[]) o2;
+        int len1 = v1.length;
+        int len2 = v2.length;
+        int lim = Math.min(len1, len2);
+
+        int k = 0;
+        while (k < lim) {
+          char c1 = v1[k];
+          char c2 = v2[k];
+          if (c1 != c2) {
+            return c1 - c2;
+          }
+          k++;
+        }
+        return len1 - len2;
+      }
+    });
+
+    StringBuilder sb = new StringBuilder();
+    for (Object item : items) {
+      if (sb.length() >= 1) {
+        sb.append(separator);
+      }
+      sb.append((char[]) item);
+    }
+    termAttribute.setEmpty().append(sb);
+    uniqueTerms.clear();
+    return true;
+
+  }
+
+  @Override
+  public final void end() throws IOException {
+    if (!inputEnded) {
+      // Rare case - if an IOException occurred while buildSingleOutputToken was
+      // running, we may not have called input.end() yet
+      input.end();
+      inputEnded = true;
+    }
+
+    if (finalState != null) {
+      restoreState(finalState);
+    }
+  }
+
+  /**
+   * {@inheritDoc}
+   */
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    inputEnded = false;
+    uniqueTerms = null;
+  }
+
+}
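
For illustration, a minimal sketch (not part of the commit) exercising the
three-argument constructor above: a small maxOutputTokenSize suppresses the output
token entirely, while the default limit plus a custom separator yields one joined
token. The FingerprintLimitDemo class and its helper are hypothetical; only
FingerprintFilter and its constructors come from this revision.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class FingerprintLimitDemo {

  // Returns the single fingerprint token for the text, or null if none is emitted.
  static String fingerprint(String text, int maxOutputTokenSize, char separator) throws Exception {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    try (TokenStream stream = new FingerprintFilter(tokenizer, maxOutputTokenSize, separator)) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      String result = stream.incrementToken() ? term.toString() : null;
      stream.end();
      return result;
    }
  }

  public static void main(String[] args) throws Exception {
    // "A1 B2" would be 5 chars, which exceeds the limit of 4, so nothing is emitted.
    System.out.println(fingerprint("B2 A1 B2", 4, ' '));    // null
    // Default limit with '_' as the separator.
    System.out.println(fingerprint("B2 A1 B2",
        FingerprintFilter.DEFAULT_MAX_OUTPUT_TOKEN_SIZE, '_')); // A1_B2
  }
}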

Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilterFactory.java?rev=1698145&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilterFactory.java Thu Aug 27 13:11:06 2015
@@ -0,0 +1,59 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link FingerprintFilter}.
+ * 
+ * <p>
+ * The {@code maxOutputTokenSize} property is optional and defaults to {@code 1024}.
+ * The {@code separator} property is optional and defaults to the space character.
+ * See {@link FingerprintFilter} for an explanation of its use.
+ * </p>
+ *
+ */
+public class FingerprintFilterFactory extends TokenFilterFactory {
+
+  public static final String MAX_OUTPUT_TOKEN_SIZE_KEY = "maxOutputTokenSize";
+  public static final String SEPARATOR_KEY = "separator";
+  final int maxOutputTokenSize;
+  final char separator;
+
+  /** Creates a new FingerprintFilterFactory */
+  public FingerprintFilterFactory(Map<String, String> args) {
+    super(args);
+    maxOutputTokenSize = getInt(args, MAX_OUTPUT_TOKEN_SIZE_KEY,
+        FingerprintFilter.DEFAULT_MAX_OUTPUT_TOKEN_SIZE);
+    separator = getChar(args, SEPARATOR_KEY,
+        FingerprintFilter.DEFAULT_SEPARATOR);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new FingerprintFilter(input, maxOutputTokenSize, separator);
+  }
+
+}
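
A sketch (not part of the commit) of obtaining the factory through the
TokenFilterFactory SPI, using the "Fingerprint" name registered by the
META-INF/services change below. The FingerprintFactoryDemo class is hypothetical;
depending on how the factory is loaded, a luceneMatchVersion argument may also need
to be supplied.

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class FingerprintFactoryDemo {
  public static void main(String[] args) throws Exception {
    // The factory consumes the arguments it recognises and rejects anything left over.
    Map<String, String> factoryArgs = new HashMap<>();
    factoryArgs.put("maxOutputTokenSize", "256");
    factoryArgs.put("separator", "_");
    TokenFilterFactory factory = TokenFilterFactory.forName("Fingerprint", factoryArgs);

    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("A1 B2 A1 D4 C3"));
    try (TokenStream stream = factory.create(tokenizer)) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(term.toString()); // prints "A1_B2_C3_D4"
      }
      stream.end();
    }
  }
}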

Modified: lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory?rev=1698145&r1=1698144&r2=1698145&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory Thu Aug 27 13:11:06 2015
@@ -61,6 +61,7 @@ org.apache.lucene.analysis.lv.LatvianSte
 org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
 org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
 org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
+org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
 org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java?rev=1698145&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java Thu Aug 27 13:11:06 2015
@@ -0,0 +1,72 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+public class TestFingerprintFilter extends BaseTokenStreamTestCase {
+
+  public void testDupsAndSorting() throws Exception {
+    for (final boolean consumeAll : new boolean[] { true, false }) {
+      MockTokenizer tokenizer = whitespaceMockTokenizer("B A B E");
+      tokenizer.setEnableChecks(consumeAll);
+      TokenStream stream = new FingerprintFilter(tokenizer);
+      assertTokenStreamContents(stream, new String[] { "A B E" });
+    }
+  }
+
+  public void testAllDupValues() throws Exception {
+    for (final boolean consumeAll : new boolean[] { true, false }) {
+      MockTokenizer tokenizer = whitespaceMockTokenizer("B2 B2");
+      tokenizer.setEnableChecks(consumeAll);
+      TokenStream stream = new FingerprintFilter(tokenizer);
+      assertTokenStreamContents(stream, new String[] { "B2" });
+    }
+  }
+
+  public void testMaxFingerprintSize() throws Exception {
+    for (final boolean consumeAll : new boolean[] { true, false }) {
+      MockTokenizer tokenizer = whitespaceMockTokenizer("B2 A1 C3 D4 E5 F6 G7 H1");
+      tokenizer.setEnableChecks(consumeAll);
+      TokenStream stream = new FingerprintFilter(tokenizer, 4, ' ');
+      assertTokenStreamContents(stream, new String[] {});
+    }
+  }
+
+  public void testCustomSeparator() throws Exception {
+    for (final boolean consumeAll : new boolean[] { true, false }) {
+      MockTokenizer tokenizer = whitespaceMockTokenizer("B2 A1 C3 B2");
+      tokenizer.setEnableChecks(consumeAll);
+      TokenStream stream = new FingerprintFilter(tokenizer,
+          FingerprintFilter.DEFAULT_MAX_OUTPUT_TOKEN_SIZE, '_');
+      assertTokenStreamContents(stream, new String[] { "A1_B2_C3" });
+    }
+  }
+
+  public void testSingleToken() throws Exception {
+    for (final boolean consumeAll : new boolean[] { true, false }) {
+      MockTokenizer tokenizer = whitespaceMockTokenizer("A1");
+      tokenizer.setEnableChecks(consumeAll);
+      TokenStream stream = new FingerprintFilter(tokenizer);
+      assertTokenStreamContents(stream, new String[] { "A1" });
+    }
+  }
+
+}

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilterFactory.java?rev=1698145&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilterFactory.java Thu Aug 27 13:11:06 2015
@@ -0,0 +1,62 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+public class TestFingerprintFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+  public void test() throws Exception {
+    for (final boolean consumeAll : new boolean[]{true, false}) {
+      Reader reader = new StringReader("A1 B2 A1 D4 C3");
+      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+      tokenizer.setReader(reader);
+      tokenizer.setEnableChecks(consumeAll);
+      TokenStream stream = tokenizer;
+      stream = tokenFilterFactory("Fingerprint",
+          FingerprintFilterFactory.MAX_OUTPUT_TOKEN_SIZE_KEY, "256",
+          FingerprintFilterFactory.SEPARATOR_KEY, "_"
+      ).create(stream);
+      assertTokenStreamContents(stream, new String[]{"A1_B2_C3_D4"});
+    }
+  }
+
+  public void testRequired() throws Exception {
+    // no params are required
+    tokenFilterFactory("Fingerprint");
+  }
+
+  /**
+   * Test that bogus arguments result in an exception
+   */
+  public void testBogusArguments() throws Exception {
+    try {
+      tokenFilterFactory("Fingerprint",
+          FingerprintFilterFactory.MAX_OUTPUT_TOKEN_SIZE_KEY, "3",
+          "bogusArg", "bogusValue");
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("Unknown parameters"));
+    }
+  }
+}