You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2019/03/26 10:27:27 UTC
[lucene-solr] branch master updated: LUCENE-6687 - avoid unnecessary looping
This is an automated email from the ASF dual-hosted git repository.
tommaso pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 42a548e LUCENE-6687 - avoid unnecessary looping
42a548e is described below
commit 42a548e28efd74e283bfafaf6dabe7ebe01251e5
Author: Tommaso Teofili <te...@adobe.com>
AuthorDate: Tue Mar 26 11:27:09 2019 +0100
LUCENE-6687 - avoid unnecessary looping
---
.../apache/lucene/queries/mlt/MoreLikeThis.java | 16 +-
.../lucene/queries/mlt/TestMoreLikeThis.java | 193 ++++++++++++++++-----
2 files changed, 153 insertions(+), 56 deletions(-)
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
index b71314b..61ebe93 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
@@ -763,15 +763,13 @@ public final class MoreLikeThis {
IOException {
Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
for (String fieldName : fieldNames) {
- for (String field : field2fieldValues.keySet()) {
- Collection<Object> fieldValues = field2fieldValues.get(field);
- if(fieldValues == null)
- continue;
- for(Object fieldValue:fieldValues) {
- if (fieldValue != null) {
- addTermFrequencies(new StringReader(String.valueOf(fieldValue)), field2termFreqMap,
- fieldName);
- }
+ Collection<Object> fieldValues = field2fieldValues.get(fieldName);
+ if (fieldValues == null)
+ continue;
+ for (Object fieldValue : fieldValues) {
+ if (fieldValue != null) {
+ addTermFrequencies(new StringReader(String.valueOf(fieldValue)), field2termFreqMap,
+ fieldName);
}
}
}
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
index 5f18840..4a60015 100644
--- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
@@ -22,6 +22,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
@@ -42,15 +43,22 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
+import static org.hamcrest.core.Is.is;
+
public class TestMoreLikeThis extends LuceneTestCase {
private static final String SHOP_TYPE = "type";
private static final String FOR_SALE = "weSell";
private static final String NOT_FOR_SALE = "weDontSell";
+ private static final int MIN_DOC_FREQ = 0;
+ private static final int MIN_WORD_LEN = 1;
+ private static final int MIN_TERM_FREQ = 1;
private Directory directory;
private IndexReader reader;
private IndexSearcher searcher;
+ private MoreLikeThis mlt;
+ private Analyzer analyzer;
@Override
public void setUp() throws Exception {
@@ -59,46 +67,64 @@ public class TestMoreLikeThis extends LuceneTestCase {
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
// Add series of docs with specific information for MoreLikeThis
- addDoc(writer, "lucene");
- addDoc(writer, "lucene release");
- addDoc(writer, "apache");
- addDoc(writer, "apache lucene");
+ addDoc(writer, "text", "lucene");
+ addDoc(writer, "text", "lucene release");
+ addDoc(writer, "text", "apache");
+ addDoc(writer, "text", "apache lucene");
+
+ // one more time to increase the doc frequencies
+ addDoc(writer, "text","lucene2");
+ addDoc(writer, "text", "lucene2 release2");
+ addDoc(writer, "text", "apache2");
+ addDoc(writer, "text", "apache2 lucene2");
+
+ addDoc(writer, "text2","lucene2");
+ addDoc(writer, "text2", "lucene2 release2");
+ addDoc(writer, "text2", "apache2");
+ addDoc(writer, "text2", "apache2 lucene2");
reader = writer.getReader();
writer.close();
searcher = newSearcher(reader);
+ analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+ mlt = this.getDefaultMoreLikeThis(reader);
}
+
+ private MoreLikeThis getDefaultMoreLikeThis(IndexReader reader) {
+ MoreLikeThis mlt = new MoreLikeThis(reader);
+ Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+ mlt.setAnalyzer(analyzer);
+ mlt.setMinDocFreq(MIN_DOC_FREQ);
+ mlt.setMinTermFreq(MIN_TERM_FREQ);
+ mlt.setMinWordLen(MIN_WORD_LEN);
+ return mlt;
+ }
+
@Override
public void tearDown() throws Exception {
+ analyzer.close();
reader.close();
directory.close();
super.tearDown();
}
- private void addDoc(RandomIndexWriter writer, String text) throws IOException {
+ private void addDoc(RandomIndexWriter writer, String fieldName, String text) throws IOException {
Document doc = new Document();
- doc.add(newTextField("text", text, Field.Store.YES));
+ doc.add(newTextField(fieldName, text, Field.Store.YES));
writer.addDocument(doc);
}
- private void addDoc(RandomIndexWriter writer, String[] texts) throws IOException {
+ private void addDoc(RandomIndexWriter writer, String fieldName, String[] texts) throws IOException {
Document doc = new Document();
for (String text : texts) {
- doc.add(newTextField("text", text, Field.Store.YES));
+ doc.add(newTextField(fieldName, text, Field.Store.YES));
}
writer.addDocument(doc);
}
public void testBoostFactor() throws Throwable {
Map<String,Float> originalValues = getOriginalValues();
-
- MoreLikeThis mlt = new MoreLikeThis(reader);
- Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
- mlt.setAnalyzer(analyzer);
- mlt.setMinDocFreq(1);
- mlt.setMinTermFreq(1);
- mlt.setMinWordLen(1);
mlt.setFieldNames(new String[] {"text"});
mlt.setBoost(true);
@@ -125,17 +151,10 @@ public class TestMoreLikeThis extends LuceneTestCase {
+ tq.getTerm().text() + "' got " + bq.getBoost(), totalBoost, bq
.getBoost(), 0.0001);
}
- analyzer.close();
}
private Map<String,Float> getOriginalValues() throws IOException {
Map<String,Float> originalValues = new HashMap<>();
- MoreLikeThis mlt = new MoreLikeThis(reader);
- Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
- mlt.setAnalyzer(analyzer);
- mlt.setMinDocFreq(1);
- mlt.setMinTermFreq(1);
- mlt.setMinWordLen(1);
mlt.setFieldNames(new String[] {"text"});
mlt.setBoost(true);
BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(
@@ -147,33 +166,21 @@ public class TestMoreLikeThis extends LuceneTestCase {
TermQuery tq = (TermQuery) bq.getQuery();
originalValues.put(tq.getTerm().text(), bq.getBoost());
}
- analyzer.close();
return originalValues;
}
// LUCENE-3326
public void testMultiFields() throws Exception {
- MoreLikeThis mlt = new MoreLikeThis(reader);
- Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
- mlt.setAnalyzer(analyzer);
- mlt.setMinDocFreq(1);
- mlt.setMinTermFreq(1);
- mlt.setMinWordLen(1);
mlt.setFieldNames(new String[] {"text", "foobar"});
mlt.like("foobar", new StringReader("this is a test"));
- analyzer.close();
}
// LUCENE-5725
public void testMultiValues() throws Exception {
- MoreLikeThis mlt = new MoreLikeThis(reader);
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
mlt.setAnalyzer(analyzer);
- mlt.setMinDocFreq(1);
- mlt.setMinTermFreq(1);
- mlt.setMinWordLen(1);
mlt.setFieldNames(new String[] {"text"});
-
+
BooleanQuery query = (BooleanQuery) mlt.like("text",
new StringReader("lucene"), new StringReader("lucene release"),
new StringReader("apache"), new StringReader("apache lucene"));
@@ -183,9 +190,110 @@ public class TestMoreLikeThis extends LuceneTestCase {
Term term = ((TermQuery) clause.getQuery()).getTerm();
assertTrue(Arrays.asList(new Term("text", "lucene"), new Term("text", "apache")).contains(term));
}
+ }
+
+ public void testSeedDocumentMap_minTermFrequencySet_shouldBuildQueryAccordingToCorrectTermFrequencies() throws Exception {
+ mlt.setMinTermFreq(3);
+
+ String mltField1 = "text";
+ String mltField2 = "text2";
+ mlt.setFieldNames(new String[]{mltField1, mltField2});
+
+ Map<String, Collection<Object>> seedDocument = new HashMap<>();
+ String textValue = "apache apache lucene lucene lucene";
+ seedDocument.put(mltField1, Arrays.asList(textValue));
+ seedDocument.put(mltField2, Arrays.asList(textValue));
+
+ BooleanQuery query = (BooleanQuery) mlt.like(seedDocument);
+ Collection<BooleanClause> clauses = query.clauses();
+ assertEquals("Expected 1 clauses only!", 1, clauses.size());
+ for (BooleanClause clause : clauses) {
+ Term term = ((TermQuery) clause.getQuery()).getTerm();
+ assertThat(term, is(new Term(mltField1, "lucene")));
+ }
analyzer.close();
}
+ public void testSeedDocumentMap_minTermFrequencySetMltFieldSet_shouldBuildQueryAccordingToCorrectTermFrequenciesAndField() throws Exception {
+ mlt.setMinTermFreq(3);
+
+ String mltField = "text";
+ mlt.setFieldNames(new String[]{mltField});
+
+ Map<String, Collection<Object>> seedDocument = new HashMap<>();
+ String sampleField2 = "text2";
+ String textValue1 = "apache apache lucene lucene";
+ String textValue2 = "apache2 apache2 lucene2 lucene2 lucene2";
+ seedDocument.put(mltField, Arrays.asList(textValue1));
+ seedDocument.put(sampleField2, Arrays.asList(textValue2));
+
+ BooleanQuery query = (BooleanQuery) mlt.like(seedDocument);
+ Collection<BooleanClause> clauses = query.clauses();
+
+ HashSet<Term> unexpectedTerms = new HashSet<>();
+ unexpectedTerms.add(new Term(mltField, "apache"));//Term Frequency < Minimum Accepted Term Frequency
+ unexpectedTerms.add(new Term(mltField, "lucene"));//Term Frequency < Minimum Accepted Term Frequency
+ unexpectedTerms.add(new Term(mltField, "apache2"));//Term Frequency < Minimum Accepted Term Frequency
+ unexpectedTerms.add(new Term(mltField, "lucene2"));//Wrong Field
+
+ //None of the Not Expected terms is in the query
+ for (BooleanClause clause : clauses) {
+ Term term = ((TermQuery) clause.getQuery()).getTerm();
+ assertFalse("Unexpected term '" + term + "' found in query terms", unexpectedTerms.contains(term));
+ }
+
+ assertEquals("Expected 0 clauses only!", 0, clauses.size());
+
+ analyzer.close();
+ }
+
+ public void testSeedDocumentMap_queryFieldsSet_shouldBuildQueryFromSpecifiedFieldnamesOnly() throws Exception {
+ mlt.setMinTermFreq(2);
+
+ String mltField = "text";
+
+ mlt.setFieldNames(new String[]{mltField});
+
+ Map<String, Collection<Object>> seedDocument = new HashMap<>();
+ String notMltField = "text2";
+ String textValue1 = "apache apache lucene lucene";
+ String textValue2 = "apache2 apache2 lucene2 lucene2 lucene2";
+ seedDocument.put(mltField, Arrays.asList(textValue1));
+ seedDocument.put(notMltField, Arrays.asList(textValue2));
+
+ HashSet<Term> expectedTerms = new HashSet<>();
+ expectedTerms.add(new Term(mltField, "apache"));
+ expectedTerms.add(new Term(mltField, "lucene"));
+
+ HashSet<Term> unexpectedTerms = new HashSet<>();
+ unexpectedTerms.add(new Term(mltField, "apache2"));
+ unexpectedTerms.add(new Term(mltField, "lucene2"));
+ unexpectedTerms.add(new Term(notMltField, "apache2"));
+ unexpectedTerms.add(new Term(notMltField, "lucene2"));
+
+ BooleanQuery query = (BooleanQuery) mlt.like(seedDocument);
+ Collection<BooleanClause> clauses = query.clauses();
+ HashSet<Term> clausesTerms = new HashSet<>();
+ for (BooleanClause clause : clauses) {
+ Term term = ((TermQuery) clause.getQuery()).getTerm();
+ clausesTerms.add(term);
+ }
+
+ assertEquals("Expected 2 clauses only!", 2, clauses.size());
+
+ //None of the Not Expected terms is in the query
+ for (BooleanClause clause : clauses) {
+ Term term = ((TermQuery) clause.getQuery()).getTerm();
+ assertFalse("Unexpected term '" + term + "' found in query terms", unexpectedTerms.contains(term));
+ }
+
+ //All of the Expected terms are in the query
+ for (Term expectedTerm : expectedTerms) {
+ assertTrue("Expected term '" + expectedTerm + "' is not found in query terms", clausesTerms.contains(expectedTerm));
+ }
+
+ }
+
// just basic equals/hashcode etc
public void testMoreLikeThisQuery() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
@@ -202,19 +310,14 @@ public class TestMoreLikeThis extends LuceneTestCase {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
for (int i = 0; i < numDocs; i++) {
- addDoc(writer, generateStrSeq(0, i + 1));
+ addDoc(writer, "text", generateStrSeq(0, i + 1));
}
IndexReader reader = writer.getReader();
writer.close();
// setup MLT query
- MoreLikeThis mlt = new MoreLikeThis(reader);
- Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
- mlt.setAnalyzer(analyzer);
+ mlt = this.getDefaultMoreLikeThis(reader);
mlt.setMaxQueryTerms(topN);
- mlt.setMinDocFreq(1);
- mlt.setMinTermFreq(1);
- mlt.setMinWordLen(1);
mlt.setFieldNames(new String[]{"text"});
// perform MLT query
@@ -293,13 +396,9 @@ public class TestMoreLikeThis extends LuceneTestCase {
writer.close();
// setup MLT query
- MoreLikeThis mlt = new MoreLikeThis(reader);
+ MoreLikeThis mlt = this.getDefaultMoreLikeThis(reader);
- mlt.setAnalyzer(analyzer);
mlt.setMaxQueryTerms(maxQueryTerms);
- mlt.setMinDocFreq(1);
- mlt.setMinTermFreq(1);
- mlt.setMinWordLen(1);
mlt.setFieldNames(new String[]{FOR_SALE, NOT_FOR_SALE});
// perform MLT query