You are viewing a plain text version of this content. The canonical link for it is here.

Posted to dev@lucene.apache.org by Gus Heck <gu...@gmail.com> on 2019/04/30 18:34:58 UTC

Re: [lucene-solr] 01/02: Fix MLT like text with custom frequencies

I'm seeing precommit failures on master that appear to be from this commit.
Also it's not clear from the commit message which issue this belongs to...

[forbidden-apis] Loading classes to check...
[forbidden-apis] Scanning classes for violations...
[forbidden-apis] Forbidden method invocation:
java.lang.String#format(java.lang.String,java.lang.Object[]) [Uses default
locale]
[forbidden-apis]   in org.apache.lucene.queries.mlt.TestMoreLikeThis
(TestMoreLikeThis.java:497)
[forbidden-apis] Scanned 239 class file(s) for forbidden API invocations
(in 0.08s), 1 error(s).


On Tue, Apr 30, 2019 at 12:16 PM <mi...@apache.org> wrote:

> This is an automated email from the ASF dual-hosted git repository.
>
> mikemccand pushed a commit to branch master
> in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
>
> commit 351e21f6203e8f3aece0cd5adf4049974bd2d636
> Author: Olli Kuonanoja <ol...@nosto.com>
> AuthorDate: Mon Apr 8 16:44:30 2019 +0300
>
>     Fix MLT like text with custom frequencies
>
>     When an analyzer with custom term frequencies is used with MLT like
>     texts, the custom term frequencies are incorrectly omitted and a fixed
>     frequency of 1 is used instead.
>
>     This commit fixes the issue by using `TermFrequencyAttribute` to get
>     the term frequencies instead of using fixed 1. Also adds test cases
>     for the mentioned issue.
> ---
>  .../apache/lucene/queries/mlt/MoreLikeThis.java    | 12 +++-
>  .../lucene/queries/mlt/TestMoreLikeThis.java       | 70
> ++++++++++++++++++++++
>  2 files changed, 79 insertions(+), 3 deletions(-)
>
> diff --git
> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
> index 61ebe93..7c077e5 100644
> ---
> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
> +++
> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
> @@ -28,6 +28,7 @@ import java.util.Set;
>  import org.apache.lucene.analysis.Analyzer;
>  import org.apache.lucene.analysis.TokenStream;
>  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
> +import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
>  import org.apache.lucene.document.Document;
>  import org.apache.lucene.index.FieldInfos;
>  import org.apache.lucene.index.Fields;
> @@ -824,6 +825,7 @@ public final class MoreLikeThis {
>        int tokenCount = 0;
>        // for every token
>        CharTermAttribute termAtt =
> ts.addAttribute(CharTermAttribute.class);
> +      TermFrequencyAttribute tfAtt =
> ts.addAttribute(TermFrequencyAttribute.class);
>        ts.reset();
>        while (ts.incrementToken()) {
>          String word = termAtt.toString();
> @@ -838,9 +840,9 @@ public final class MoreLikeThis {
>          // increment frequency
>          Int cnt = termFreqMap.get(word);
>          if (cnt == null) {
> -          termFreqMap.put(word, new Int());
> +          termFreqMap.put(word, new Int(tfAtt.getTermFrequency()));
>          } else {
> -          cnt.x++;
> +          cnt.x += tfAtt.getTermFrequency();
>          }
>        }
>        ts.end();
> @@ -982,7 +984,11 @@ public final class MoreLikeThis {
>      int x;
>
>      Int() {
> -      x = 1;
> +      this(1);
> +    }
> +
> +    Int(int initialValue) {
> +      x = initialValue;
>      }
>    }
>  }
> diff --git
> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
> index 4a60015..aeec534 100644
> ---
> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
> +++
> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
> @@ -27,7 +27,12 @@ import java.util.Map;
>
>  import org.apache.lucene.analysis.Analyzer;
>  import org.apache.lucene.analysis.MockAnalyzer;
> +import org.apache.lucene.analysis.MockTokenFilter;
>  import org.apache.lucene.analysis.MockTokenizer;
> +import org.apache.lucene.analysis.TokenFilter;
> +import org.apache.lucene.analysis.TokenStream;
> +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
> +import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
>  import org.apache.lucene.document.Document;
>  import org.apache.lucene.document.Field;
>  import org.apache.lucene.index.IndexReader;
> @@ -41,6 +46,7 @@ import org.apache.lucene.search.Query;
>  import org.apache.lucene.search.QueryUtils;
>  import org.apache.lucene.search.TermQuery;
>  import org.apache.lucene.store.Directory;
> +import org.apache.lucene.util.ArrayUtil;
>  import org.apache.lucene.util.LuceneTestCase;
>
>  import static org.hamcrest.core.Is.is;
> @@ -427,5 +433,69 @@ public class TestMoreLikeThis extends LuceneTestCase {
>        analyzer.close();
>      }
>    }
> +
> +  public void testCustomFrequecy() throws IOException {
> +    // define an analyzer with delimited term frequency, e.g. "foo|2
> bar|3"
> +    Analyzer analyzer = new Analyzer() {
> +
> +      @Override
> +      protected TokenStreamComponents createComponents(String fieldName) {
> +        MockTokenizer tokenizer = new
> MockTokenizer(MockTokenizer.WHITESPACE, false, 100);
> +        MockTokenFilter filt = new MockTokenFilter(tokenizer,
> MockTokenFilter.EMPTY_STOPSET);
> +        return new TokenStreamComponents(tokenizer,
> addCustomTokenFilter(filt));
> +      }
> +
> +      TokenStream addCustomTokenFilter(TokenStream input) {
> +        return new TokenFilter(input) {
> +          final CharTermAttribute termAtt =
> addAttribute(CharTermAttribute.class);
> +          final TermFrequencyAttribute tfAtt =
> addAttribute(TermFrequencyAttribute.class);
> +
> +          @Override
> +          public boolean incrementToken() throws IOException {
> +            if (input.incrementToken()) {
> +              final char[] buffer = termAtt.buffer();
> +              final int length = termAtt.length();
> +              for (int i = 0; i < length; i++) {
> +                if (buffer[i] == '|') {
> +                  termAtt.setLength(i);
> +                  i++;
> +                  tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i,
> length - i));
> +                  return true;
> +                }
> +              }
> +              return true;
> +            }
> +            return false;
> +          }
> +        };
> +      }
> +    };
> +
> +    mlt.setAnalyzer(analyzer);
> +    mlt.setFieldNames(new String[] {"text"});
> +    mlt.setBoost(true);
> +
> +    final double boost10 = ((BooleanQuery) mlt.like("text", new
> StringReader("lucene|10 release|1")))
> +        .clauses()
> +        .stream()
> +        .map(BooleanClause::getQuery)
> +        .map(BoostQuery.class::cast)
> +        .filter(x -> ((TermQuery)
> x.getQuery()).getTerm().text().equals("lucene"))
> +        .mapToDouble(BoostQuery::getBoost)
> +        .sum();
> +
> +    final double boost1 = ((BooleanQuery) mlt.like("text", new
> StringReader("lucene|1 release|1")))
> +        .clauses()
> +        .stream()
> +        .map(BooleanClause::getQuery)
> +        .map(BoostQuery.class::cast)
> +        .filter(x -> ((TermQuery)
> x.getQuery()).getTerm().text().equals("lucene"))
> +        .mapToDouble(BoostQuery::getBoost)
> +        .sum();
> +
> +    // mlt should use the custom frequencies provided by the analyzer so
> "lucene|10" should be boosted more than "lucene|1"
> +    assertTrue(String.format("%s should be grater than %s", boost10,
> boost1), boost10 > boost1);
> +  }
> +
>    // TODO: add tests for the MoreLikeThisQuery
>  }
>
>

-- 
http://www.needhamsoftware.com (work)
http://www.the111shift.com (play)

Re: [lucene-solr] 01/02: Fix MLT like text with custom frequencies

Posted by Michael McCandless <lu...@mikemccandless.com>.

I'm sorry about this -- I saw this failure locally, thought I fixed it
before pushing, but obviously failed to.  I blame git ;)

Thank you to Christine for stepping in and fixing it!

Mike McCandless

http://blog.mikemccandless.com


On Tue, Apr 30, 2019 at 3:33 PM Gus Heck <gu...@gmail.com> wrote:

> https://builds.apache.org/view/L/view/Lucene/job/PreCommit-SOLR-Build/392/
>
> On Tue, Apr 30, 2019 at 2:37 PM Kevin Risden <kr...@apache.org> wrote:
>
>> It might be https://issues.apache.org/jira/browse/LUCENE-8756
>>
>> Kevin Risden
>>
>>
>> On Tue, Apr 30, 2019 at 2:35 PM Gus Heck <gu...@gmail.com> wrote:
>>
>>> I'm seeing precommit failures on master that appear to be from this
>>> commit. Also it's not clear from the commit message which issue this
>>> belongs to...
>>>
>>> [forbidden-apis] Loading classes to check...
>>> [forbidden-apis] Scanning classes for violations...
>>> [forbidden-apis] Forbidden method invocation:
>>> java.lang.String#format(java.lang.String,java.lang.Object[]) [Uses default
>>> locale]
>>> [forbidden-apis]   in org.apache.lucene.queries.mlt.TestMoreLikeThis
>>> (TestMoreLikeThis.java:497)
>>> [forbidden-apis] Scanned 239 class file(s) for forbidden API invocations
>>> (in 0.08s), 1 error(s).
>>>
>>>
>>> On Tue, Apr 30, 2019 at 12:16 PM <mi...@apache.org> wrote:
>>>
>>>> This is an automated email from the ASF dual-hosted git repository.
>>>>
>>>> mikemccand pushed a commit to branch master
>>>> in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
>>>>
>>>> commit 351e21f6203e8f3aece0cd5adf4049974bd2d636
>>>> Author: Olli Kuonanoja <ol...@nosto.com>
>>>> AuthorDate: Mon Apr 8 16:44:30 2019 +0300
>>>>
>>>>     Fix MLT like text with custom frequencies
>>>>
>>>>     When an analyzer with custom term frequencies is used with MLT like
>>>>     texts, the custom term frequencies are incorrectly omitted and a
>>>> fixed
>>>>     frequency of 1 is used instead.
>>>>
>>>>     This commit fixes the issue by using `TermFrequencyAttribute` to get
>>>>     the term frequencies instead of using fixed 1. Also adds test cases
>>>>     for the mentioned issue.
>>>> ---
>>>>  .../apache/lucene/queries/mlt/MoreLikeThis.java    | 12 +++-
>>>>  .../lucene/queries/mlt/TestMoreLikeThis.java       | 70
>>>> ++++++++++++++++++++++
>>>>  2 files changed, 79 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git
>>>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>>>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>>>> index 61ebe93..7c077e5 100644
>>>> ---
>>>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>>>> +++
>>>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>>>> @@ -28,6 +28,7 @@ import java.util.Set;
>>>>  import org.apache.lucene.analysis.Analyzer;
>>>>  import org.apache.lucene.analysis.TokenStream;
>>>>  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>>>> +import
>>>> org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
>>>>  import org.apache.lucene.document.Document;
>>>>  import org.apache.lucene.index.FieldInfos;
>>>>  import org.apache.lucene.index.Fields;
>>>> @@ -824,6 +825,7 @@ public final class MoreLikeThis {
>>>>        int tokenCount = 0;
>>>>        // for every token
>>>>        CharTermAttribute termAtt =
>>>> ts.addAttribute(CharTermAttribute.class);
>>>> +      TermFrequencyAttribute tfAtt =
>>>> ts.addAttribute(TermFrequencyAttribute.class);
>>>>        ts.reset();
>>>>        while (ts.incrementToken()) {
>>>>          String word = termAtt.toString();
>>>> @@ -838,9 +840,9 @@ public final class MoreLikeThis {
>>>>          // increment frequency
>>>>          Int cnt = termFreqMap.get(word);
>>>>          if (cnt == null) {
>>>> -          termFreqMap.put(word, new Int());
>>>> +          termFreqMap.put(word, new Int(tfAtt.getTermFrequency()));
>>>>          } else {
>>>> -          cnt.x++;
>>>> +          cnt.x += tfAtt.getTermFrequency();
>>>>          }
>>>>        }
>>>>        ts.end();
>>>> @@ -982,7 +984,11 @@ public final class MoreLikeThis {
>>>>      int x;
>>>>
>>>>      Int() {
>>>> -      x = 1;
>>>> +      this(1);
>>>> +    }
>>>> +
>>>> +    Int(int initialValue) {
>>>> +      x = initialValue;
>>>>      }
>>>>    }
>>>>  }
>>>> diff --git
>>>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>>>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>>>> index 4a60015..aeec534 100644
>>>> ---
>>>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>>>> +++
>>>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>>>> @@ -27,7 +27,12 @@ import java.util.Map;
>>>>
>>>>  import org.apache.lucene.analysis.Analyzer;
>>>>  import org.apache.lucene.analysis.MockAnalyzer;
>>>> +import org.apache.lucene.analysis.MockTokenFilter;
>>>>  import org.apache.lucene.analysis.MockTokenizer;
>>>> +import org.apache.lucene.analysis.TokenFilter;
>>>> +import org.apache.lucene.analysis.TokenStream;
>>>> +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>>>> +import
>>>> org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
>>>>  import org.apache.lucene.document.Document;
>>>>  import org.apache.lucene.document.Field;
>>>>  import org.apache.lucene.index.IndexReader;
>>>> @@ -41,6 +46,7 @@ import org.apache.lucene.search.Query;
>>>>  import org.apache.lucene.search.QueryUtils;
>>>>  import org.apache.lucene.search.TermQuery;
>>>>  import org.apache.lucene.store.Directory;
>>>> +import org.apache.lucene.util.ArrayUtil;
>>>>  import org.apache.lucene.util.LuceneTestCase;
>>>>
>>>>  import static org.hamcrest.core.Is.is;
>>>> @@ -427,5 +433,69 @@ public class TestMoreLikeThis extends
>>>> LuceneTestCase {
>>>>        analyzer.close();
>>>>      }
>>>>    }
>>>> +
>>>> +  public void testCustomFrequecy() throws IOException {
>>>> +    // define an analyzer with delimited term frequency, e.g. "foo|2
>>>> bar|3"
>>>> +    Analyzer analyzer = new Analyzer() {
>>>> +
>>>> +      @Override
>>>> +      protected TokenStreamComponents createComponents(String
>>>> fieldName) {
>>>> +        MockTokenizer tokenizer = new
>>>> MockTokenizer(MockTokenizer.WHITESPACE, false, 100);
>>>> +        MockTokenFilter filt = new MockTokenFilter(tokenizer,
>>>> MockTokenFilter.EMPTY_STOPSET);
>>>> +        return new TokenStreamComponents(tokenizer,
>>>> addCustomTokenFilter(filt));
>>>> +      }
>>>> +
>>>> +      TokenStream addCustomTokenFilter(TokenStream input) {
>>>> +        return new TokenFilter(input) {
>>>> +          final CharTermAttribute termAtt =
>>>> addAttribute(CharTermAttribute.class);
>>>> +          final TermFrequencyAttribute tfAtt =
>>>> addAttribute(TermFrequencyAttribute.class);
>>>> +
>>>> +          @Override
>>>> +          public boolean incrementToken() throws IOException {
>>>> +            if (input.incrementToken()) {
>>>> +              final char[] buffer = termAtt.buffer();
>>>> +              final int length = termAtt.length();
>>>> +              for (int i = 0; i < length; i++) {
>>>> +                if (buffer[i] == '|') {
>>>> +                  termAtt.setLength(i);
>>>> +                  i++;
>>>> +                  tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i,
>>>> length - i));
>>>> +                  return true;
>>>> +                }
>>>> +              }
>>>> +              return true;
>>>> +            }
>>>> +            return false;
>>>> +          }
>>>> +        };
>>>> +      }
>>>> +    };
>>>> +
>>>> +    mlt.setAnalyzer(analyzer);
>>>> +    mlt.setFieldNames(new String[] {"text"});
>>>> +    mlt.setBoost(true);
>>>> +
>>>> +    final double boost10 = ((BooleanQuery) mlt.like("text", new
>>>> StringReader("lucene|10 release|1")))
>>>> +        .clauses()
>>>> +        .stream()
>>>> +        .map(BooleanClause::getQuery)
>>>> +        .map(BoostQuery.class::cast)
>>>> +        .filter(x -> ((TermQuery)
>>>> x.getQuery()).getTerm().text().equals("lucene"))
>>>> +        .mapToDouble(BoostQuery::getBoost)
>>>> +        .sum();
>>>> +
>>>> +    final double boost1 = ((BooleanQuery) mlt.like("text", new
>>>> StringReader("lucene|1 release|1")))
>>>> +        .clauses()
>>>> +        .stream()
>>>> +        .map(BooleanClause::getQuery)
>>>> +        .map(BoostQuery.class::cast)
>>>> +        .filter(x -> ((TermQuery)
>>>> x.getQuery()).getTerm().text().equals("lucene"))
>>>> +        .mapToDouble(BoostQuery::getBoost)
>>>> +        .sum();
>>>> +
>>>> +    // mlt should use the custom frequencies provided by the analyzer
>>>> so "lucene|10" should be boosted more than "lucene|1"
>>>> +    assertTrue(String.format("%s should be grater than %s", boost10,
>>>> boost1), boost10 > boost1);
>>>> +  }
>>>> +
>>>>    // TODO: add tests for the MoreLikeThisQuery
>>>>  }
>>>>
>>>>
>>>
>>> --
>>> http://www.needhamsoftware.com (work)
>>> http://www.the111shift.com (play)
>>>
>>
>
> --
> http://www.needhamsoftware.com (work)
> http://www.the111shift.com (play)
>

Re: [lucene-solr] 01/02: Fix MLT like text with custom frequencies

Posted by Gus Heck <gu...@gmail.com>.

https://builds.apache.org/view/L/view/Lucene/job/PreCommit-SOLR-Build/392/

On Tue, Apr 30, 2019 at 2:37 PM Kevin Risden <kr...@apache.org> wrote:

> It might be https://issues.apache.org/jira/browse/LUCENE-8756
>
> Kevin Risden
>
>
> On Tue, Apr 30, 2019 at 2:35 PM Gus Heck <gu...@gmail.com> wrote:
>
>> I'm seeing precommit failures on master that appear to be from this
>> commit. Also it's not clear from the commit message which issue this
>> belongs to...
>>
>> [forbidden-apis] Loading classes to check...
>> [forbidden-apis] Scanning classes for violations...
>> [forbidden-apis] Forbidden method invocation:
>> java.lang.String#format(java.lang.String,java.lang.Object[]) [Uses default
>> locale]
>> [forbidden-apis]   in org.apache.lucene.queries.mlt.TestMoreLikeThis
>> (TestMoreLikeThis.java:497)
>> [forbidden-apis] Scanned 239 class file(s) for forbidden API invocations
>> (in 0.08s), 1 error(s).
>>
>>
>> On Tue, Apr 30, 2019 at 12:16 PM <mi...@apache.org> wrote:
>>
>>> This is an automated email from the ASF dual-hosted git repository.
>>>
>>> mikemccand pushed a commit to branch master
>>> in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
>>>
>>> commit 351e21f6203e8f3aece0cd5adf4049974bd2d636
>>> Author: Olli Kuonanoja <ol...@nosto.com>
>>> AuthorDate: Mon Apr 8 16:44:30 2019 +0300
>>>
>>>     Fix MLT like text with custom frequencies
>>>
>>>     When an analyzer with custom term frequencies is used with MLT like
>>>     texts, the custom term frequencies are incorrectly omitted and a
>>> fixed
>>>     frequency of 1 is used instead.
>>>
>>>     This commit fixes the issue by using `TermFrequencyAttribute` to get
>>>     the term frequencies instead of using fixed 1. Also adds test cases
>>>     for the mentioned issue.
>>> ---
>>>  .../apache/lucene/queries/mlt/MoreLikeThis.java    | 12 +++-
>>>  .../lucene/queries/mlt/TestMoreLikeThis.java       | 70
>>> ++++++++++++++++++++++
>>>  2 files changed, 79 insertions(+), 3 deletions(-)
>>>
>>> diff --git
>>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>>> index 61ebe93..7c077e5 100644
>>> ---
>>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>>> +++
>>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>>> @@ -28,6 +28,7 @@ import java.util.Set;
>>>  import org.apache.lucene.analysis.Analyzer;
>>>  import org.apache.lucene.analysis.TokenStream;
>>>  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>>> +import
>>> org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
>>>  import org.apache.lucene.document.Document;
>>>  import org.apache.lucene.index.FieldInfos;
>>>  import org.apache.lucene.index.Fields;
>>> @@ -824,6 +825,7 @@ public final class MoreLikeThis {
>>>        int tokenCount = 0;
>>>        // for every token
>>>        CharTermAttribute termAtt =
>>> ts.addAttribute(CharTermAttribute.class);
>>> +      TermFrequencyAttribute tfAtt =
>>> ts.addAttribute(TermFrequencyAttribute.class);
>>>        ts.reset();
>>>        while (ts.incrementToken()) {
>>>          String word = termAtt.toString();
>>> @@ -838,9 +840,9 @@ public final class MoreLikeThis {
>>>          // increment frequency
>>>          Int cnt = termFreqMap.get(word);
>>>          if (cnt == null) {
>>> -          termFreqMap.put(word, new Int());
>>> +          termFreqMap.put(word, new Int(tfAtt.getTermFrequency()));
>>>          } else {
>>> -          cnt.x++;
>>> +          cnt.x += tfAtt.getTermFrequency();
>>>          }
>>>        }
>>>        ts.end();
>>> @@ -982,7 +984,11 @@ public final class MoreLikeThis {
>>>      int x;
>>>
>>>      Int() {
>>> -      x = 1;
>>> +      this(1);
>>> +    }
>>> +
>>> +    Int(int initialValue) {
>>> +      x = initialValue;
>>>      }
>>>    }
>>>  }
>>> diff --git
>>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>>> index 4a60015..aeec534 100644
>>> ---
>>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>>> +++
>>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>>> @@ -27,7 +27,12 @@ import java.util.Map;
>>>
>>>  import org.apache.lucene.analysis.Analyzer;
>>>  import org.apache.lucene.analysis.MockAnalyzer;
>>> +import org.apache.lucene.analysis.MockTokenFilter;
>>>  import org.apache.lucene.analysis.MockTokenizer;
>>> +import org.apache.lucene.analysis.TokenFilter;
>>> +import org.apache.lucene.analysis.TokenStream;
>>> +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>>> +import
>>> org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
>>>  import org.apache.lucene.document.Document;
>>>  import org.apache.lucene.document.Field;
>>>  import org.apache.lucene.index.IndexReader;
>>> @@ -41,6 +46,7 @@ import org.apache.lucene.search.Query;
>>>  import org.apache.lucene.search.QueryUtils;
>>>  import org.apache.lucene.search.TermQuery;
>>>  import org.apache.lucene.store.Directory;
>>> +import org.apache.lucene.util.ArrayUtil;
>>>  import org.apache.lucene.util.LuceneTestCase;
>>>
>>>  import static org.hamcrest.core.Is.is;
>>> @@ -427,5 +433,69 @@ public class TestMoreLikeThis extends
>>> LuceneTestCase {
>>>        analyzer.close();
>>>      }
>>>    }
>>> +
>>> +  public void testCustomFrequecy() throws IOException {
>>> +    // define an analyzer with delimited term frequency, e.g. "foo|2
>>> bar|3"
>>> +    Analyzer analyzer = new Analyzer() {
>>> +
>>> +      @Override
>>> +      protected TokenStreamComponents createComponents(String
>>> fieldName) {
>>> +        MockTokenizer tokenizer = new
>>> MockTokenizer(MockTokenizer.WHITESPACE, false, 100);
>>> +        MockTokenFilter filt = new MockTokenFilter(tokenizer,
>>> MockTokenFilter.EMPTY_STOPSET);
>>> +        return new TokenStreamComponents(tokenizer,
>>> addCustomTokenFilter(filt));
>>> +      }
>>> +
>>> +      TokenStream addCustomTokenFilter(TokenStream input) {
>>> +        return new TokenFilter(input) {
>>> +          final CharTermAttribute termAtt =
>>> addAttribute(CharTermAttribute.class);
>>> +          final TermFrequencyAttribute tfAtt =
>>> addAttribute(TermFrequencyAttribute.class);
>>> +
>>> +          @Override
>>> +          public boolean incrementToken() throws IOException {
>>> +            if (input.incrementToken()) {
>>> +              final char[] buffer = termAtt.buffer();
>>> +              final int length = termAtt.length();
>>> +              for (int i = 0; i < length; i++) {
>>> +                if (buffer[i] == '|') {
>>> +                  termAtt.setLength(i);
>>> +                  i++;
>>> +                  tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i,
>>> length - i));
>>> +                  return true;
>>> +                }
>>> +              }
>>> +              return true;
>>> +            }
>>> +            return false;
>>> +          }
>>> +        };
>>> +      }
>>> +    };
>>> +
>>> +    mlt.setAnalyzer(analyzer);
>>> +    mlt.setFieldNames(new String[] {"text"});
>>> +    mlt.setBoost(true);
>>> +
>>> +    final double boost10 = ((BooleanQuery) mlt.like("text", new
>>> StringReader("lucene|10 release|1")))
>>> +        .clauses()
>>> +        .stream()
>>> +        .map(BooleanClause::getQuery)
>>> +        .map(BoostQuery.class::cast)
>>> +        .filter(x -> ((TermQuery)
>>> x.getQuery()).getTerm().text().equals("lucene"))
>>> +        .mapToDouble(BoostQuery::getBoost)
>>> +        .sum();
>>> +
>>> +    final double boost1 = ((BooleanQuery) mlt.like("text", new
>>> StringReader("lucene|1 release|1")))
>>> +        .clauses()
>>> +        .stream()
>>> +        .map(BooleanClause::getQuery)
>>> +        .map(BoostQuery.class::cast)
>>> +        .filter(x -> ((TermQuery)
>>> x.getQuery()).getTerm().text().equals("lucene"))
>>> +        .mapToDouble(BoostQuery::getBoost)
>>> +        .sum();
>>> +
>>> +    // mlt should use the custom frequencies provided by the analyzer
>>> so "lucene|10" should be boosted more than "lucene|1"
>>> +    assertTrue(String.format("%s should be grater than %s", boost10,
>>> boost1), boost10 > boost1);
>>> +  }
>>> +
>>>    // TODO: add tests for the MoreLikeThisQuery
>>>  }
>>>
>>>
>>
>> --
>> http://www.needhamsoftware.com (work)
>> http://www.the111shift.com (play)
>>
>

-- 
http://www.needhamsoftware.com (work)
http://www.the111shift.com (play)

Re: [lucene-solr] 01/02: Fix MLT like text with custom frequencies

Posted by Kevin Risden <kr...@apache.org>.

It might be https://issues.apache.org/jira/browse/LUCENE-8756

Kevin Risden


On Tue, Apr 30, 2019 at 2:35 PM Gus Heck <gu...@gmail.com> wrote:

> I'm seeing precommit failures on master that appear to be from this
> commit. Also it's not clear from the commit message which issue this
> belongs to...
>
> [forbidden-apis] Loading classes to check...
> [forbidden-apis] Scanning classes for violations...
> [forbidden-apis] Forbidden method invocation:
> java.lang.String#format(java.lang.String,java.lang.Object[]) [Uses default
> locale]
> [forbidden-apis]   in org.apache.lucene.queries.mlt.TestMoreLikeThis
> (TestMoreLikeThis.java:497)
> [forbidden-apis] Scanned 239 class file(s) for forbidden API invocations
> (in 0.08s), 1 error(s).
>
>
> On Tue, Apr 30, 2019 at 12:16 PM <mi...@apache.org> wrote:
>
>> This is an automated email from the ASF dual-hosted git repository.
>>
>> mikemccand pushed a commit to branch master
>> in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
>>
>> commit 351e21f6203e8f3aece0cd5adf4049974bd2d636
>> Author: Olli Kuonanoja <ol...@nosto.com>
>> AuthorDate: Mon Apr 8 16:44:30 2019 +0300
>>
>>     Fix MLT like text with custom frequencies
>>
>>     When an analyzer with custom term frequencies is used with MLT like
>>     texts, the custom term frequencies are incorrectly omitted and a fixed
>>     frequency of 1 is used instead.
>>
>>     This commit fixes the issue by using `TermFrequencyAttribute` to get
>>     the term frequencies instead of using fixed 1. Also adds test cases
>>     for the mentioned issue.
>> ---
>>  .../apache/lucene/queries/mlt/MoreLikeThis.java    | 12 +++-
>>  .../lucene/queries/mlt/TestMoreLikeThis.java       | 70
>> ++++++++++++++++++++++
>>  2 files changed, 79 insertions(+), 3 deletions(-)
>>
>> diff --git
>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>> index 61ebe93..7c077e5 100644
>> ---
>> a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>> +++
>> b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
>> @@ -28,6 +28,7 @@ import java.util.Set;
>>  import org.apache.lucene.analysis.Analyzer;
>>  import org.apache.lucene.analysis.TokenStream;
>>  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>> +import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
>>  import org.apache.lucene.document.Document;
>>  import org.apache.lucene.index.FieldInfos;
>>  import org.apache.lucene.index.Fields;
>> @@ -824,6 +825,7 @@ public final class MoreLikeThis {
>>        int tokenCount = 0;
>>        // for every token
>>        CharTermAttribute termAtt =
>> ts.addAttribute(CharTermAttribute.class);
>> +      TermFrequencyAttribute tfAtt =
>> ts.addAttribute(TermFrequencyAttribute.class);
>>        ts.reset();
>>        while (ts.incrementToken()) {
>>          String word = termAtt.toString();
>> @@ -838,9 +840,9 @@ public final class MoreLikeThis {
>>          // increment frequency
>>          Int cnt = termFreqMap.get(word);
>>          if (cnt == null) {
>> -          termFreqMap.put(word, new Int());
>> +          termFreqMap.put(word, new Int(tfAtt.getTermFrequency()));
>>          } else {
>> -          cnt.x++;
>> +          cnt.x += tfAtt.getTermFrequency();
>>          }
>>        }
>>        ts.end();
>> @@ -982,7 +984,11 @@ public final class MoreLikeThis {
>>      int x;
>>
>>      Int() {
>> -      x = 1;
>> +      this(1);
>> +    }
>> +
>> +    Int(int initialValue) {
>> +      x = initialValue;
>>      }
>>    }
>>  }
>> diff --git
>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>> index 4a60015..aeec534 100644
>> ---
>> a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>> +++
>> b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
>> @@ -27,7 +27,12 @@ import java.util.Map;
>>
>>  import org.apache.lucene.analysis.Analyzer;
>>  import org.apache.lucene.analysis.MockAnalyzer;
>> +import org.apache.lucene.analysis.MockTokenFilter;
>>  import org.apache.lucene.analysis.MockTokenizer;
>> +import org.apache.lucene.analysis.TokenFilter;
>> +import org.apache.lucene.analysis.TokenStream;
>> +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>> +import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
>>  import org.apache.lucene.document.Document;
>>  import org.apache.lucene.document.Field;
>>  import org.apache.lucene.index.IndexReader;
>> @@ -41,6 +46,7 @@ import org.apache.lucene.search.Query;
>>  import org.apache.lucene.search.QueryUtils;
>>  import org.apache.lucene.search.TermQuery;
>>  import org.apache.lucene.store.Directory;
>> +import org.apache.lucene.util.ArrayUtil;
>>  import org.apache.lucene.util.LuceneTestCase;
>>
>>  import static org.hamcrest.core.Is.is;
>> @@ -427,5 +433,69 @@ public class TestMoreLikeThis extends LuceneTestCase
>> {
>>        analyzer.close();
>>      }
>>    }
>> +
>> +  public void testCustomFrequecy() throws IOException {
>> +    // define an analyzer with delimited term frequency, e.g. "foo|2
>> bar|3"
>> +    Analyzer analyzer = new Analyzer() {
>> +
>> +      @Override
>> +      protected TokenStreamComponents createComponents(String fieldName)
>> {
>> +        MockTokenizer tokenizer = new
>> MockTokenizer(MockTokenizer.WHITESPACE, false, 100);
>> +        MockTokenFilter filt = new MockTokenFilter(tokenizer,
>> MockTokenFilter.EMPTY_STOPSET);
>> +        return new TokenStreamComponents(tokenizer,
>> addCustomTokenFilter(filt));
>> +      }
>> +
>> +      TokenStream addCustomTokenFilter(TokenStream input) {
>> +        return new TokenFilter(input) {
>> +          final CharTermAttribute termAtt =
>> addAttribute(CharTermAttribute.class);
>> +          final TermFrequencyAttribute tfAtt =
>> addAttribute(TermFrequencyAttribute.class);
>> +
>> +          @Override
>> +          public boolean incrementToken() throws IOException {
>> +            if (input.incrementToken()) {
>> +              final char[] buffer = termAtt.buffer();
>> +              final int length = termAtt.length();
>> +              for (int i = 0; i < length; i++) {
>> +                if (buffer[i] == '|') {
>> +                  termAtt.setLength(i);
>> +                  i++;
>> +                  tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i,
>> length - i));
>> +                  return true;
>> +                }
>> +              }
>> +              return true;
>> +            }
>> +            return false;
>> +          }
>> +        };
>> +      }
>> +    };
>> +
>> +    mlt.setAnalyzer(analyzer);
>> +    mlt.setFieldNames(new String[] {"text"});
>> +    mlt.setBoost(true);
>> +
>> +    final double boost10 = ((BooleanQuery) mlt.like("text", new
>> StringReader("lucene|10 release|1")))
>> +        .clauses()
>> +        .stream()
>> +        .map(BooleanClause::getQuery)
>> +        .map(BoostQuery.class::cast)
>> +        .filter(x -> ((TermQuery)
>> x.getQuery()).getTerm().text().equals("lucene"))
>> +        .mapToDouble(BoostQuery::getBoost)
>> +        .sum();
>> +
>> +    final double boost1 = ((BooleanQuery) mlt.like("text", new
>> StringReader("lucene|1 release|1")))
>> +        .clauses()
>> +        .stream()
>> +        .map(BooleanClause::getQuery)
>> +        .map(BoostQuery.class::cast)
>> +        .filter(x -> ((TermQuery)
>> x.getQuery()).getTerm().text().equals("lucene"))
>> +        .mapToDouble(BoostQuery::getBoost)
>> +        .sum();
>> +
>> +    // mlt should use the custom frequencies provided by the analyzer so
>> "lucene|10" should be boosted more than "lucene|1"
>> +    assertTrue(String.format("%s should be grater than %s", boost10,
>> boost1), boost10 > boost1);
>> +  }
>> +
>>    // TODO: add tests for the MoreLikeThisQuery
>>  }
>>
>>
>
> --
> http://www.needhamsoftware.com (work)
> http://www.the111shift.com (play)
>