You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mh...@apache.org on 2009/07/06 21:32:55 UTC
svn commit: r791579 - in /lucene/java/trunk: CHANGES.txt
src/java/org/apache/lucene/queryParser/ComplexPhraseQueryParser.java
src/test/org/apache/lucene/queryParser/TestComplexPhraseQuery.java
Author: mharwood
Date: Mon Jul 6 19:32:54 2009
New Revision: 791579
URL: http://svn.apache.org/viewvc?rev=791579&view=rev
Log:
Initial commit of LUCENE-1486 - a subclass of the default QueryParser that overrides the parsing of PhraseQueries to allow more complex syntax e.g. wildcards in phrase queries
Added:
lucene/java/trunk/src/java/org/apache/lucene/queryParser/ComplexPhraseQueryParser.java
lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestComplexPhraseQuery.java
Modified:
lucene/java/trunk/CHANGES.txt
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=791579&r1=791578&r2=791579&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Mon Jul 6 19:32:54 2009
@@ -521,6 +521,12 @@
All standard parsers now also implement Serializable and enforce
their singleton status. (Uwe Schindler, Mike McCandless)
+31. LUCENE-1486: Added ComplexPhraseQueryParser, an extension of QueryParser
+ that allows a subset of the Lucene query language to be embedded in
+ PhraseQuerys. Wildcard, Range, and Fuzzy queries, as well as limited
+ boolean logic, can be used within quote operators with this parser, ie:
+ "(jo* -john) smyth~". (Mark Harwood via Mark Miller)
+
Optimizations
1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
Added: lucene/java/trunk/src/java/org/apache/lucene/queryParser/ComplexPhraseQueryParser.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/queryParser/ComplexPhraseQueryParser.java?rev=791579&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/queryParser/ComplexPhraseQueryParser.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/queryParser/ComplexPhraseQueryParser.java Mon Jul 6 19:32:54 2009
@@ -0,0 +1,386 @@
+package org.apache.lucene.queryParser;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanNotQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+
+/**
+ * QueryParser which permits complex phrase query syntax e.g. "(john jon
+ * jonathan~) peters*"
+ *
+ * Performs potentially multiple passes over Query text to parse any nested
+ * logic in PhraseQueries. - First pass takes any PhraseQuery content between
+ * quotes and stores for subsequent pass. All other query content is parsed as
+ * normal - Second pass parses any stored PhraseQuery content, checking all
+ * embedded clauses are referring to the same field and therefore can be
+ * rewritten as Span queries. All PhraseQuery clauses are expressed as
+ * ComplexPhraseQuery objects
+ *
+ * This could arguably be done in one pass using a new QueryParser but here I am
+ * working within the constraints of the existing parser as a base class. This
+ * currently simply feeds all phrase content through an analyzer to select
+ * phrase terms - any "special" syntax such as * ~ * etc are not given special
+ * status
+ *
+ *
+ */
+public class ComplexPhraseQueryParser extends QueryParser {
+ private ArrayList/*<ComplexPhraseQuery>*/complexPhrases = null;
+
+ private boolean isPass2ResolvingPhrases;
+
+ private ComplexPhraseQuery currentPhraseQuery = null;
+
+ public ComplexPhraseQueryParser(String f, Analyzer a) {
+ super(f, a);
+ }
+
+ protected Query getFieldQuery(String field, String queryText, int slop) {
+ ComplexPhraseQuery cpq = new ComplexPhraseQuery(field, queryText, slop);
+ complexPhrases.add(cpq); // add to list of phrases to be parsed once
+ // we
+ // are through with this pass
+ return cpq;
+ }
+
+ public Query parse(String query) throws ParseException {
+ if (isPass2ResolvingPhrases) {
+ boolean oldConstantScoreRewriteSetting = getConstantScoreRewrite();
+ try {
+ // Temporarily set constantScoreRewrite to false so that Parser will
+ // generate visible
+ // collection of terms which we can convert into SpanQueries.
+ // ConstantScoreRewrite mode produces an
+ // opaque ConstantScoreQuery object which cannot be interrogated for
+ // terms in the same way a BooleanQuery can.
+ // QueryParser is not guaranteed threadsafe anyway so this temporary
+ // state change should not
+ // present an issue
+ setConstantScoreRewrite(false);
+ return super.parse(query);
+ } finally {
+ setConstantScoreRewrite(oldConstantScoreRewriteSetting);
+ }
+ }
+
+ // First pass - parse the top-level query recording any PhraseQuerys
+ // which will need to be resolved
+ complexPhrases = new ArrayList/*<ComplexPhraseQuery>*/();
+ Query q = super.parse(query);
+
+ // Perform second pass, using this QueryParser to parse any nested
+ // PhraseQueries with different
+ // set of syntax restrictions (i.e. all fields must be same)
+ isPass2ResolvingPhrases = true;
+ try {
+ for (Iterator iterator = complexPhrases.iterator(); iterator.hasNext();) {
+ currentPhraseQuery = (ComplexPhraseQuery) iterator.next();
+ // in each phrase, now parse the contents between quotes as a
+ // separate parse operation
+ currentPhraseQuery.parsePhraseElements(this);
+ }
+ } finally {
+ isPass2ResolvingPhrases = false;
+ }
+ return q;
+ }
+
+ // There is No "getTermQuery throws ParseException" method to override so
+ // unfortunately need
+ // to throw a runtime exception here if a term for another field is embedded
+ // in phrase query
+ protected Query newTermQuery(Term term) {
+ if (isPass2ResolvingPhrases) {
+ try {
+ checkPhraseClauseIsForSameField(term.field());
+ } catch (ParseException pe) {
+ throw new RuntimeException("Error parsing complex phrase", pe);
+ }
+ }
+ return super.newTermQuery(term);
+ }
+
+ // Helper method used to report on any clauses that appear in query syntax
+ private void checkPhraseClauseIsForSameField(String field)
+ throws ParseException {
+ if (!field.equals(currentPhraseQuery.field)) {
+ throw new ParseException("Cannot have clause for field \"" + field
+ + "\" nested in phrase " + " for field \"" + currentPhraseQuery.field
+ + "\"");
+ }
+ }
+
+ protected Query getWildcardQuery(String field, String termStr)
+ throws ParseException {
+ if (isPass2ResolvingPhrases) {
+ checkPhraseClauseIsForSameField(field);
+ }
+ return super.getWildcardQuery(field, termStr);
+ }
+
+ protected Query getRangeQuery(String field, String part1, String part2,
+ boolean inclusive) throws ParseException {
+ if (isPass2ResolvingPhrases) {
+ checkPhraseClauseIsForSameField(field);
+ }
+ return super.getRangeQuery(field, part1, part2, inclusive);
+ }
+
+ protected Query newRangeQuery(String field, String part1, String part2,
+ boolean inclusive) {
+ if (isPass2ResolvingPhrases) {
+ // Must use old-style RangeQuery in order to produce a BooleanQuery
+ // that can be turned into SpanOr clause
+ TermRangeQuery rangeQuery = new TermRangeQuery(field, part1, part2, inclusive, inclusive,
+ getRangeCollator());
+ rangeQuery.setConstantScoreRewrite(false);;
+ return rangeQuery;
+ }
+ return super.newRangeQuery(field, part1, part2, inclusive);
+ }
+
+ protected Query getFuzzyQuery(String field, String termStr,
+ float minSimilarity) throws ParseException {
+ if (isPass2ResolvingPhrases) {
+ checkPhraseClauseIsForSameField(field);
+ }
+ return super.getFuzzyQuery(field, termStr, minSimilarity);
+ }
+
+ /*
+ * Used to handle the query content in between quotes and produced Span-based
+ * interpretations of the clauses.
+ */
+ static class ComplexPhraseQuery extends Query {
+
+ String field;
+
+ String phrasedQueryStringContents;
+
+ int slopFactor;
+
+ private Query contents;
+
+ public ComplexPhraseQuery(String field, String phrasedQueryStringContents,
+ int slopFactor) {
+ super();
+ this.field = field;
+ this.phrasedQueryStringContents = phrasedQueryStringContents;
+ this.slopFactor = slopFactor;
+ }
+
+ // Called by ComplexPhraseQueryParser for each phrase after the main
+ // parse
+ // thread is through
+ protected void parsePhraseElements(QueryParser qp) throws ParseException {
+ // TODO ensure that field-sensitivity is preserved ie the query
+ // string below is parsed as
+ // field+":("+phrasedQueryStringContents+")"
+ // but this will need code in rewrite to unwrap the first layer of
+ // boolean query
+ contents = qp.parse(phrasedQueryStringContents);
+ }
+
+ public Query rewrite(IndexReader reader) throws IOException {
+ // ArrayList spanClauses = new ArrayList();
+ if (contents instanceof TermQuery) {
+ return contents;
+ }
+ // Build a sequence of Span clauses arranged in a SpanNear - child
+ // clauses can be complex
+ // Booleans e.g. nots and ors etc
+ int numNegatives = 0;
+ if (!(contents instanceof BooleanQuery)) {
+ throw new IllegalArgumentException("Unknown query type \""
+ + contents.getClass().getName()
+ + "\" found in phrase query string \"" + phrasedQueryStringContents
+ + "\"");
+ }
+ BooleanQuery bq = (BooleanQuery) contents;
+ BooleanClause[] bclauses = bq.getClauses();
+ SpanQuery[] allSpanClauses = new SpanQuery[bclauses.length];
+ // For all clauses e.g. one* two~
+ for (int i = 0; i < bclauses.length; i++) {
+ // HashSet bclauseterms=new HashSet();
+ Query qc = bclauses[i].getQuery();
+ // Rewrite this clause e.g one* becomes (one OR onerous)
+ qc = qc.rewrite(reader);
+ if (bclauses[i].getOccur().equals(BooleanClause.Occur.MUST_NOT)) {
+ numNegatives++;
+ }
+
+ if (qc instanceof BooleanQuery) {
+ ArrayList sc = new ArrayList();
+ addComplexPhraseClause(sc, (BooleanQuery) qc);
+ if (sc.size() > 0) {
+ allSpanClauses[i] = (SpanQuery) sc.get(0);
+ } else {
+ // Insert fake term e.g. phrase query was for "Fred Smithe*" and
+ // there were no "Smithe*" terms - need to
+ // prevent match on just "Fred".
+ allSpanClauses[i] = new SpanTermQuery(new Term(field,
+ "Dummy clause because no terms found - must match nothing"));
+ }
+ } else {
+ if (qc instanceof TermQuery) {
+ TermQuery tq = (TermQuery) qc;
+ allSpanClauses[i] = new SpanTermQuery(tq.getTerm());
+ } else {
+ throw new IllegalArgumentException("Unknown query type \""
+ + qc.getClass().getName()
+ + "\" found in phrase query string \""
+ + phrasedQueryStringContents + "\"");
+ }
+
+ }
+ }
+ if (numNegatives == 0) {
+ // The simple case - no negative elements in phrase
+ return new SpanNearQuery(allSpanClauses, slopFactor, true);
+ }
+ // Complex case - we have mixed positives and negatives in the
+ // sequence.
+ // Need to return a SpanNotQuery
+ ArrayList positiveClauses = new ArrayList();
+ for (int j = 0; j < allSpanClauses.length; j++) {
+ if (!bclauses[j].getOccur().equals(BooleanClause.Occur.MUST_NOT)) {
+ positiveClauses.add(allSpanClauses[j]);
+ }
+ }
+
+ SpanQuery[] includeClauses = (SpanQuery[]) positiveClauses
+ .toArray(new SpanQuery[positiveClauses.size()]);
+
+ SpanQuery include = null;
+ if (includeClauses.length == 1) {
+ include = includeClauses[0]; // only one positive clause
+ } else {
+ // need to increase slop factor based on gaps introduced by
+ // negatives
+ include = new SpanNearQuery(includeClauses, slopFactor + numNegatives,
+ true);
+ }
+ // Use sequence of positive and negative values as the exclude.
+ SpanNearQuery exclude = new SpanNearQuery(allSpanClauses, slopFactor,
+ true);
+ SpanNotQuery snot = new SpanNotQuery(include, exclude);
+ return snot;
+ }
+
+ private void addComplexPhraseClause(List spanClauses, BooleanQuery qc) {
+ ArrayList ors = new ArrayList();
+ ArrayList nots = new ArrayList();
+ BooleanClause[] bclauses = qc.getClauses();
+
+ // For all clauses e.g. one* two~
+ for (int i = 0; i < bclauses.length; i++) {
+ Query childQuery = bclauses[i].getQuery();
+
+ // select the list to which we will add these options
+ ArrayList chosenList = ors;
+ if (bclauses[i].getOccur() == BooleanClause.Occur.MUST_NOT) {
+ chosenList = nots;
+ }
+
+ if (childQuery instanceof TermQuery) {
+ TermQuery tq = (TermQuery) childQuery;
+ SpanTermQuery stq = new SpanTermQuery(tq.getTerm());
+ stq.setBoost(tq.getBoost());
+ chosenList.add(stq);
+ } else if (childQuery instanceof BooleanQuery) {
+ BooleanQuery cbq = (BooleanQuery) childQuery;
+ addComplexPhraseClause(chosenList, cbq);
+ } else {
+ // TODO alternatively could call extract terms here?
+ throw new IllegalArgumentException("Unknown query type:"
+ + childQuery.getClass().getName());
+ }
+ }
+ if (ors.size() == 0) {
+ return;
+ }
+ SpanOrQuery soq = new SpanOrQuery((SpanQuery[]) ors
+ .toArray(new SpanQuery[ors.size()]));
+ if (nots.size() == 0) {
+ spanClauses.add(soq);
+ } else {
+ SpanOrQuery snqs = new SpanOrQuery((SpanQuery[]) nots
+ .toArray(new SpanQuery[nots.size()]));
+ SpanNotQuery snq = new SpanNotQuery(soq, snqs);
+ spanClauses.add(snq);
+ }
+ }
+
+ public String toString(String field) {
+ return "\"" + phrasedQueryStringContents + "\"";
+ }
+
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((field == null) ? 0 : field.hashCode());
+ result = prime
+ * result
+ + ((phrasedQueryStringContents == null) ? 0
+ : phrasedQueryStringContents.hashCode());
+ result = prime * result + slopFactor;
+ return result;
+ }
+
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ ComplexPhraseQuery other = (ComplexPhraseQuery) obj;
+ if (field == null) {
+ if (other.field != null)
+ return false;
+ } else if (!field.equals(other.field))
+ return false;
+ if (phrasedQueryStringContents == null) {
+ if (other.phrasedQueryStringContents != null)
+ return false;
+ } else if (!phrasedQueryStringContents
+ .equals(other.phrasedQueryStringContents))
+ return false;
+ if (slopFactor != other.slopFactor)
+ return false;
+ return true;
+ }
+ }
+}
Added: lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestComplexPhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestComplexPhraseQuery.java?rev=791579&view=auto
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestComplexPhraseQuery.java (added)
+++ lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestComplexPhraseQuery.java Mon Jul 6 19:32:54 2009
@@ -0,0 +1,143 @@
+package org.apache.lucene.queryParser;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashSet;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.RAMDirectory;
+
+public class TestComplexPhraseQuery extends TestCase {
+
+ Analyzer analyzer = new StandardAnalyzer();
+
+ DocData docsContent[] = { new DocData("john smith", "1"),
+ new DocData("johathon smith", "2"),
+ new DocData("john percival smith", "3"),
+ new DocData("jackson waits tom", "4") };
+
+ private IndexSearcher searcher;
+
+ String defaultFieldName = "name";
+
+ public void testComplexPhrases() throws Exception {
+ checkMatches("\"john smith\"", "1"); // Simple multi-term still works
+ checkMatches("\"j* smyth~\"", "1,2"); // wildcards and fuzzies are OK in
+ // phrases
+ checkMatches("\"(jo* -john) smith\"", "2"); // boolean logic works
+ checkMatches("\"jo* smith\"~2", "1,2,3"); // position logic works.
+ checkMatches("\"jo* [sma TO smZ]\" ", "1,2"); // range queries supported
+ checkMatches("\"john\"", "1,3"); // Simple single-term still works
+ checkMatches("\"(john OR johathon) smith\"", "1,2"); // boolean logic with
+ // brackets works.
+ checkMatches("\"(jo* -john) smyth~\"", "2"); // boolean logic with
+ // brackets works.
+
+ // checkMatches("\"john -percival\"", "1"); // not logic doesn't work
+ // currently :(.
+
+ checkMatches("\"john nosuchword*\"", ""); // phrases with clauses producing
+ // empty sets
+
+ checkBadQuery("\"jo* id:1 smith\""); // mixing fields in a phrase is bad
+ checkBadQuery("\"jo* \"smith\" \""); // phrases inside phrases is bad
+ }
+
+ private void checkBadQuery(String qString) {
+ QueryParser qp = new ComplexPhraseQueryParser(defaultFieldName, analyzer);
+ Throwable expected = null;
+ try {
+ qp.parse(qString);
+ } catch (Throwable e) {
+ expected = e;
+ }
+ assertNotNull("Expected parse error in " + qString, expected);
+
+ }
+
+ private void checkMatches(String qString, String expectedVals)
+ throws Exception {
+ QueryParser qp = new ComplexPhraseQueryParser(defaultFieldName, analyzer);
+ qp.setFuzzyPrefixLength(1); // usually a good idea
+
+ Query q = qp.parse(qString);
+
+ HashSet expecteds = new HashSet();
+ String[] vals = expectedVals.split(",");
+ for (int i = 0; i < vals.length; i++) {
+ if (vals[i].length() > 0)
+ expecteds.add(vals[i]);
+ }
+
+ TopDocs td = searcher.search(q, 10);
+ ScoreDoc[] sd = td.scoreDocs;
+ for (int i = 0; i < sd.length; i++) {
+ Document doc = searcher.doc(sd[i].doc);
+ String id = doc.get("id");
+ assertTrue(qString + "matched doc#" + id + " not expected", expecteds
+ .contains(id));
+ expecteds.remove(id);
+ }
+
+ assertEquals(qString + " missing some matches ", 0, expecteds.size());
+
+ }
+
+ protected void setUp() throws Exception {
+ RAMDirectory rd = new RAMDirectory();
+ IndexWriter w = new IndexWriter(rd, analyzer, MaxFieldLength.UNLIMITED);
+ for (int i = 0; i < docsContent.length; i++) {
+ Document doc = new Document();
+ doc.add(new Field("name", docsContent[i].name, Field.Store.YES,
+ Field.Index.ANALYZED));
+ doc.add(new Field("id", docsContent[i].id, Field.Store.YES,
+ Field.Index.ANALYZED));
+ w.addDocument(doc);
+ }
+ w.close();
+ searcher = new IndexSearcher(rd);
+ }
+
+ protected void tearDown() throws Exception {
+ searcher.close();
+ }
+
+ static class DocData {
+ String name;
+
+ String id;
+
+ public DocData(String name, String id) {
+ super();
+ this.name = name;
+ this.id = id;
+ }
+ }
+
+}