You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by er...@apache.org on 2011/11/25 17:54:10 UTC
svn commit: r1206258 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/backwards/src/test-framework/ lucene/backwards/src/test/
lucene/src/java/org/apache/lucene/queryParser/ solr/
solr/core/src/java/org/apache/solr/schema/ solr/core/src/java/org/...
Author: erick
Date: Fri Nov 25 16:54:07 2011
New Revision: 1206258
URL: http://svn.apache.org/viewvc?rev=1206258&view=rev
Log:
SOLR-2438 allowing "multiterm" entry in the schema analysis chain, synthesizing one from the existing query chain if not present
Added:
lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-folding.xml
lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java
lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/backwards/src/test/ (props changed)
lucene/dev/branches/branch_3x/lucene/backwards/src/test-framework/ (props changed)
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/CHANGES.txt
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/FieldProperties.java
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/FieldType.java
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/IndexSchema.java
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/SchemaField.java
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/TextField.java
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java
lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml
lucene/dev/branches/branch_3x/solr/solrj/ (props changed)
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java?rev=1206258&r1=1206257&r2=1206258&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/queryParser/QueryParser.java Fri Nov 25 16:54:07 2011
@@ -33,6 +33,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.VirtualMethod;
@@ -875,6 +876,41 @@ public class QueryParser implements Quer
return new FuzzyQuery(term,minimumSimilarity,prefixLength);
}
+ /**
+  * Analyzes one piece of a multi-term query (the text handed in as <code>part</code>)
+  * and returns the single term the analyzer produces for it.
+  *
+  * @param field field name passed to the analyzer
+  * @param part raw term text to analyze
+  * @param analyzerIn analyzer to run; when null, this parser's own analyzer is used
+  * @return the text of the one token produced by the analyzer
+  * @throws IllegalArgumentException if the analyzer yields no token, or more than one
+  * @throws RuntimeException wrapping any IOException raised by the token stream
+  */
+ protected String analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
+   if (analyzerIn == null) analyzerIn = analyzer;
+
+   TokenStream source = null;
+   boolean finished = false;
+   try {
+     try {
+       source = analyzerIn.tokenStream(field, new StringReader(part));
+       source.reset();
+     } catch (IOException e) {
+       throw new RuntimeException("Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
+     }
+
+     CharTermAttribute termAtt = source.getAttribute(CharTermAttribute.class);
+     String termRet = "";
+
+     try {
+       if (!source.incrementToken())
+         throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
+       termRet = termAtt.toString();
+       if (source.incrementToken())
+         throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
+     } catch (IOException e) {
+       throw new RuntimeException("error analyzing range part: " + part, e);
+     }
+
+     try {
+       source.end();
+       source.close();
+     } catch (IOException e) {
+       throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
+     }
+
+     finished = true;
+     return termRet;
+   } finally {
+     // Every failure path above used to leave the stream open. Close it here,
+     // but never let a secondary close() failure hide the exception in flight.
+     if (!finished && source != null) {
+       try {
+         source.close();
+       } catch (IOException ignored) {
+         // the original failure is already propagating
+       }
+     }
+   }
+ }
+
+
/**
* Builds a new TermRangeQuery instance
* @param field Field
Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1206258&r1=1206257&r2=1206258&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Fri Nov 25 16:54:07 2011
@@ -30,6 +30,10 @@ New Features
* SOLR-1565: StreamingUpdateSolrServer supports RequestWriter API and therefore, javabin update
format (shalin)
+
+* SOLR-2438: Case insensitive search for wildcard queries. Actually, the ability to specify
+ a complete analysis chain for multiterm queries.
+ (Peter Sturge, Erick Erickson, mentoring from Seeley and Muir)
Bug Fixes
----------------------
@@ -54,6 +58,340 @@ Upgrading from Solr 3.4
New Features
----------------------
+
+* SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now
+ supports "percentages" which get evaluated relative to the current size of
+ the cache when warming happens.
+ (Tomas Fernandez Lobbe and hossman)
+
+* SOLR-1932: New relevancy function queries: termfreq, tf, docfreq, idf
+ norm, maxdoc, numdocs. (yonik)
+
+* SOLR-1665: Add debug component options for timings, results and query info only (gsingers, hossman, yonik)
+
+* SOLR-2001: The query component will substitute an empty query that matches
+ no documents if the query parser returns null. This also prevents an
+ exception from being thrown by the default parser if "q" is missing. (yonik)
+
+* SOLR-2112: Solrj API now supports streaming results. (ryan)
+
+* SOLR-792: Adding PivotFacetComponent for Hierarchical faceting
+ (erik, Jeremy Hinegardner, Thibaut Lassalle, ryan)
+
+* LUCENE-2507, SOLR-2571, SOLR-2576: Added DirectSolrSpellChecker, which uses Lucene's
+ DirectSpellChecker to retrieve correction candidates directly from the term dictionary using
+ levenshtein automata. (James Dyer, rmuir)
+
+* SOLR-1873: SolrCloud - added shared/central config and core/shard management via zookeeper,
+ built-in load balancing, and infrastructure for future SolrCloud work. (yonik, Mark Miller)
+ Additional Work:
+ SOLR-2324: SolrCloud solr.xml parameters are not persisted by CoreContainer.
+ (Massimo Schiavon, Mark Miller)
+ SOLR-2799: Update CloudState incrementally rather than always reading the data at each zk
+ node. (Jamie Johnson via Mark Miller)
+
+* SOLR-1729: Evaluation of NOW for date math is done only once per request for
+ consistency, and is also propagated to shards in distributed search.
+ Adding a parameter NOW=<time_in_ms> to the request will override the
+ current time. (Peter Sturge, yonik)
+
+* SOLR-1566: Transforming documents in the ResponseWriters. This will allow
+ for more complex results in responses and open the door for function queries
+ as results.
+ (ryan with patches from grant, noble, cmale, yonik, Jan Høydahl,
+ Arul Kalaipandian, hossman)
+ SOLR-2037: Thanks to SOLR-1566, documents boosted by the QueryElevationComponent
+ can be marked as boosted. (gsingers, ryan, yonik)
+
+* SOLR-2396: Add CollationField, which is much more efficient than
+ the Solr 3.x CollationKeyFilterFactory, and also supports
+ Locale-sensitive range queries. (rmuir)
+
+* SOLR-2338: Add support for using <similarity/> in a schema's fieldType,
+ for customizing scoring on a per-field basis. (hossman, yonik, rmuir)
+
+* SOLR-2335: New 'field("...")' function syntax for referring to complex
+ field names (containing whitespace or special characters) in functions.
+
+* SOLR-1709: Distributed support for Date and Numeric Range Faceting
+ (Peter Sturge, David Smiley, hossman)
+
+* SOLR-2383: /browse improvements: generalize range and date facet display
+ (Jan Høydahl via yonik)
+
+* SOLR-2272: Pseudo-join queries / filters. Examples:
+ To restrict to the set of parents with at least one blue-eyed child:
+ fq={!join from=parent to=name}eyes:blue
+ To restrict to the set of children with at least one blue-eyed parent:
+ fq={!join from=name to=parent}eyes:blue
+ (yonik)
+
+* SOLR-1942: Added the ability to select postings format per fieldType in schema.xml
+ as well as support custom Codecs in solrconfig.xml.
+ (simonw via rmuir)
+
+* SOLR-2136: Boolean type added to function queries, along with
+ new functions exists(), if(), and(), or(), xor(), not(), def(),
+ and true and false constants. (yonik)
+
+* SOLR-2491: Add support for using spellcheck collation in conjunction
+ with grouping. Note that the number of hits returned for collations
+ is the number of ungrouped hits. (James Dyer via rmuir)
+
+* SOLR-1298: Return FunctionQuery as pseudo field. The solr 'fl' param
+ now supports functions. For example: fl=id,sum(x,y) -- NOTE: only
+ functions with fast random access are recommended. (yonik, ryan)
+
+* SOLR-705: Optionally return shard info with each document in distributed
+ search. Use fl=id,[shard] to return the shard url. (ryan)
+
+* SOLR-2417: Add explain info directly to return documents using
+ ?fl=id,[explain] (ryan)
+
+* SOLR-2533: Converted ValueSource.ValueSourceSortField over to new rewriteable Lucene
+ SortFields. ValueSourceSortField instances must be rewritten before they can be used.
+ This is done by SolrIndexSearcher when necessary. (Chris Male).
+
+* SOLR-2193, SOLR-2565: You may now specify a 'soft' commit when committing. This will
+ use Lucene's NRT feature to avoid guaranteeing documents are on stable storage in exchange
+ for faster reopen times. There is also a new 'soft' autocommit tracker that can be
+ configured. (Mark Miller, Robert Muir)
+
+* SOLR-2399: Updated Solr Admin interface. New look and feel with per core administration
+ and many new options. (Stefan Matheis via ryan)
+
+* SOLR-1032: CSV handler now supports "literal.field_name=value" parameters.
+ (Simon Rosenthal, ehatcher)
+
+* SOLR-2656: realtime-get, efficiently retrieves the latest stored fields for specified
+ documents, even if they are not yet searchable (i.e. without reopening a searcher)
+ (yonik)
+
+* SOLR-2703: Added support for Lucene's "surround" query parser. (Simon Rosenthal, ehatcher)
+
+* SOLR-2754: Added factories for several ranking algorithms:
+ BM25SimilarityFactory: Okapi BM25
+ DFRSimilarityFactory: Divergence from Randomness models
+ IBSimilarityFactory: Information-based models
+ LMDirichletSimilarity: LM with Dirichlet smoothing
+ LMJelinekMercerSimilarity: LM with Jelinek-Mercer smoothing
+ (David Mark Nemeskey, Robert Muir)
+
+* SOLR-2134 Trie* fields should support sortMissingLast=true, and deprecate Sortable* Field Types
+ (Ryan McKinley, Mike McCandless, Uwe Schindler, Erick Erickson)
+
+* SOLR-2438: Case insensitive search for wildcard queries. Actually, the ability to specify
+ a complete analysis chain for multiterm queries.
+ (Peter Sturge, Erick Erickson, mentoring from Seeley and Muir)
+
+
+Optimizations
+----------------------
+
+* SOLR-1875: Per-segment field faceting for single valued string fields.
+ Enable with facet.method=fcs, control the number of threads used with
+ the "threads" local param on the facet.field param. This algorithm will
+ only be faster in the presence of rapid index changes. (yonik)
+
+* SOLR-1904: When facet.enum.cache.minDf > 0 and the base doc set is a
+ SortedIntSet, convert to HashDocSet for better performance. (yonik)
+
+* SOLR-1843: A new "rootName" attribute is now available when
+ configuring <jmx/> in solrconfig.xml. If this attribute is set,
+ Solr will use it as the root name for all MBeans Solr exposes via
+ JMX. The default root name is "solr" followed by the core name.
+ (Constantijn Visinescu, hossman)
+
+* SOLR-2092: Speed up single-valued and multi-valued "fc" faceting. Typical
+ improvement is 5%, but can be much greater (up to 10x faster) when facet.offset
+ is very large (deep paging). (yonik)
+
+* SOLR-2193, SOLR-2565: The default Solr update handler has been improved so
+ that it uses fewer locks, keeps the IndexWriter open rather than closing it
+ on each commit (ie commits no longer wait for background merges to complete),
+ works with SolrCore to provide faster 'soft' commits, and has an improved API
+ that requires less instanceof special casing. (Mark Miller, Robert Muir)
+ Additional Work:
+ SOLR-2697: commit and autocommit operations don't reset
+ DirectUpdateHandler2.numDocsPending stats attribute.
+ (Alexey Serba, Mark Miller)
+
+Bug Fixes
+----------------------
+
+* SOLR-2762: FSTLookup could return duplicate results or one result fewer
+ than requested. (David Smiley, Dawid Weiss)
+
+* SOLR-2741: Bugs in facet range display in trunk (janhoy)
+
+* SOLR-1908: Fixed SignatureUpdateProcessor to fail to initialize on
+ invalid config. Specifically: a signatureField that does not exist,
+ or overwriteDupes=true with a signatureField that is not indexed.
+ (hossman)
+
+* SOLR-1824: IndexSchema will now fail to initialize if there is a
+ problem initializing one of the fields or field types. (hossman)
+
+* SOLR-1928: TermsComponent didn't correctly break ties for non-text
+ fields sorted by count. (yonik)
+
+* SOLR-2107: MoreLikeThisHandler doesn't work with alternate qparsers. (yonik)
+
+* SOLR-2108: Fixed false positives when using wildcard queries on fields with reversed
+ wildcard support. For example, a query of *zemog* would match documents that contain
+ 'gomez'. (Landon Kuhn via Robert Muir)
+
+* SOLR-1962: SolrCore#initIndex should not use a mix of indexPath and newIndexPath (Mark Miller)
+
+* SOLR-2275: fix DisMax 'mm' parsing to be tolerant of whitespace
+ (Erick Erickson via hossman)
+
+* SOLR-2193, SOLR-2565, SOLR-2651: SolrCores now properly share IndexWriters across SolrCore reloads.
+ (Mark Miller, Robert Muir)
+ Additional Work:
+ SOLR-2705: On reload, IndexWriterProvider holds onto the initial SolrCore it was created with.
+ (Yury Kats, Mark Miller)
+
+* SOLR-2682: Remove addException() in SimpleFacet. FacetComponent no longer catches and embeds
+ exceptions occurred during facet processing, it throws HTTP 400 or 500 exceptions instead. (koji)
+
+* SOLR-2654: Directorys used by a SolrCore are now closed when they are no longer used.
+ (Mark Miller)
+
+* SOLR-2854: Now load URL content stream data (via stream.url) when called for during request handling,
+ rather than loading URL content streams automatically regardless of use.
+ (David Smiley and Ryan McKinley via ehatcher)
+
+* SOLR-2829: Fix problem with false-positives due to incorrect
+ equals methods. (Yonik Seeley, Hossman, Erick Erickson.
+ Marc Tinnemeyer caught the bug)
+
+* SOLR-2848: Removed 'instanceof AbstractLuceneSpellChecker' hacks from distributed spellchecking code,
+ and added a merge() method to SolrSpellChecker instead. Previously if you extended SolrSpellChecker
+ your spellchecker would not work in distributed fashion. (James Dyer via rmuir)
+
+Other Changes
+----------------------
+
+* SOLR-1846: Eliminate support for the abortOnConfigurationError
+ option. It has never worked very well, and in recent versions of
+ Solr hasn't worked at all. (hossman)
+
+* SOLR-1889: The default logic for the 'mm' param of DismaxQParser and
+ ExtendedDismaxQParser has been changed to be determined based on the
+ effective value of the 'q.op' param (hossman)
+
+* SOLR-1946: Misc improvements to the SystemInfoHandler: /admin/system
+ (hossman)
+
+* SOLR-2289: Tweak spatial coords for example docs so they are a bit
+ more spread out (Erick Erickson via hossman)
+
+* SOLR-2288: Small tweaks to eliminate compiler warnings. primarily
+ using Generics where applicable in method/object declarations, and
+ adding @SuppressWarnings("unchecked") when appropriate (hossman)
+
+* SOLR-2375: Suggester Lookup implementations now store trie data
+ and load it back on init. This means that large tries don't have to be
+ rebuilt on every commit or core reload. (ab)
+
+* SOLR-2413: Support for returning multi-valued fields w/o <arr> tag
+ in the XMLResponseWriter was removed. XMLResponseWriter
+ no longer works with values less than 2.2 (ryan)
+
+* SOLR-2423: FieldType argument changed from String to Object
+ Conversion from SolrInputDocument > Object > Fieldable is now managed
+ by FieldType rather than DocumentBuilder. (ryan)
+
+* SOLR-2461: QuerySenderListener and AbstractSolrEventListener are
+ now public (hossman)
+
+* LUCENE-2995: Moved some spellchecker and suggest APIs to modules/suggest:
+ HighFrequencyDictionary, SortedIterator, TermFreqIterator, and the
+ suggester APIs and implementations. (rmuir)
+
+* SOLR-2576: Remove deprecated SpellingResult.add(Token, int).
+ (James Dyer via rmuir)
+
+* LUCENE-3232: Moved MutableValue classes to new 'common' module. (Chris Male)
+
+* LUCENE-2883: FunctionQuery, DocValues (and its impls), ValueSource (and its
+ impls) and BoostedQuery have been consolidated into the queries module. They
+ can now be found at o.a.l.queries.function.
+
+* SOLR-2027: FacetField.getValues() now returns an empty list if there are no
+ values, instead of null (Chris Male)
+
+* SOLR-1825: SolrQuery.addFacetQuery now enables facets automatically, like
+ addFacetField (Chris Male)
+
+* SOLR-2663: FieldTypePluginLoader has been refactored out of IndexSchema
+ and made public. (hossman)
+
+* SOLR-2331,SOLR-2691: Refactor CoreContainer's SolrXML serialization code and improve testing
+ (Yury Kats, hossman, Mark Miller)
+
+* SOLR-2698: Enhance CoreAdmin STATUS command to return index size.
+ (Yury Kats, hossman, Mark Miller)
+
+* SOLR-2654: The same Directory instance is now always used across a SolrCore so that
+ it's easier to add other DirectoryFactory's without static caching hacks.
+ (Mark Miller)
+
+* LUCENE-3286: 'luke' ant target has been disabled due to incompatibilities with XML
+ queryparser location (Chris Male)
+
+* SOLR-1897: The data dir from the core descriptor should override the data dir from
+ the solrconfig.xml rather than the other way round. (Mark Miller)
+
+* SOLR-2756: Maven configuration: Excluded transitive stax:stax-api dependency
+ from org.codehaus.woodstox:wstx-asl dependency. (David Smiley via Steve Rowe)
+
+* SOLR-2588: Moved VelocityResponseWriter back to contrib module in order to
+ remove it as a mandatory core dependency. (ehatcher)
+
+* SOLR-2718: Add ability to lazy load response writers, defined with startup="lazy".
+ (ehatcher)
+
+* SOLR-2862: More explicit lexical resources location logged if Carrot2 clustering
+ extension is used. Fixed solr. impl. of IResource and IResourceLookup. (Dawid Weiss)
+
+* SOLR-1123: Changed JSONResponseWriter to now use application/json as its Content-Type
+ by default. However the Content-Type can be overwritten and is set to text/plain in
+ the example configuration. (Uri Boness, Chris Male)
+
+* SOLR-2607: Removed deprecated client/ruby directory, which included solr-ruby and flare.
+ (ehatcher)
+
+Documentation
+----------------------
+
+* SOLR-2232: Improved README info on solr.solr.home in examples
+ (Eric Pugh and hossman)
+
+================== 3.6.0 ==================
+
+New Features
+----------------------
+* SOLR-2904: BinaryUpdateRequestHandler should be able to accept multiple update requests from
+ a stream (shalin)
+
+* SOLR-1565: StreamingUpdateSolrServer supports RequestWriter API and therefore, javabin update
+ format (shalin)
+
+* SOLR-2438: Case insensitive search for wildcard queries. Actually, the ability to specify
+ a complete analysis chain for multiterm queries.
+ (Peter Sturge, Erick Erickson, mentoring from Seeley and Muir)
+
+
+Bug Fixes
+----------------------
+* SOLR-2912: Fixed File descriptor leak in ShowFileRequestHandler (Michael Ryan, shalin)
+
+================== 3.5.0 ==================
+
+New Features
+----------------------
* SOLR-2749: Add boundary scanners for FastVectorHighlighter. <boundaryScanner/>
can be specified with a name in solrconfig.xml, and use hl.boundaryScanner=name
parameter to specify the named <boundaryScanner/>. (koji)
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/FieldProperties.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/FieldProperties.java?rev=1206258&r1=1206257&r2=1206258&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/FieldProperties.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/FieldProperties.java Fri Nov 25 16:54:07 2011
@@ -48,13 +48,15 @@ public abstract class FieldProperties {
protected final static int REQUIRED = 0x00001000;
protected final static int OMIT_POSITIONS = 0x00002000;
+ protected final static int LEGACY_MULTITERM = 0x00004000;
static final String[] propertyNames = {
"indexed", "tokenized", "stored",
"binary", "omitNorms", "omitTermFreqAndPositions",
"termVectors", "termPositions", "termOffsets",
"multiValued",
- "sortMissingFirst","sortMissingLast","required", "omitPositions"
+ "sortMissingFirst","sortMissingLast","required", "omitPositions",
+ "legacyMultiTerm"
};
static final Map<String,Integer> propertyMap = new HashMap<String,Integer>();
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/FieldType.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/FieldType.java?rev=1206258&r1=1206257&r2=1206258&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/FieldType.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/FieldType.java Fri Nov 25 16:54:07 2011
@@ -431,6 +431,21 @@ public abstract class FieldType extends
protected Analyzer queryAnalyzer=analyzer;
/**
+ * Analyzer set by schema for text types to use when searching fields
+ * of this type, subclasses can set analyzer themselves or override
+ * getAnalyzer()
+ * This analyzer is used to process wildcard, prefix, regex and other multiterm queries. It
+ * assembles a list of tokenizer +filters that "make sense" for this, primarily accent folding and
+ * lowercasing filters, and charfilters.
+ *
+ * If users require old-style behavior, they can specify 'legacyMultiTerm="true"' in the schema file
+ * @see #getMultiTermAnalyzer
+ * @see #setMultiTermAnalyzer
+ */
+ protected Analyzer multiTermAnalyzer=null;
+
+
+ /**
* Returns the Analyzer to be used when indexing fields of this type.
* <p>
* This method may be called many times, at any time.
@@ -452,7 +467,19 @@ public abstract class FieldType extends
return queryAnalyzer;
}
- private final String analyzerError =
+ /**
+  * Returns the Analyzer applied to the terms of multi-term queries against
+  * fields of this type. The value is whatever was last stored in
+  * <code>multiTermAnalyzer</code>, which starts out as null.
+  * <p>
+  * This method may be called many times, at any time.
+  * </p>
+  *
+  * @see #getAnalyzer
+  */
+ public Analyzer getMultiTermAnalyzer() {
+   return this.multiTermAnalyzer;
+ }
+
+ private final String analyzerError =
"FieldType: " + this.getClass().getSimpleName() +
" (" + typeName + ") does not support specifying an analyzer";
@@ -480,6 +507,28 @@ public abstract class FieldType extends
/**
* Sets the Analyzer to be used when querying fields of this type.
+ * <p/>
+ * <p>
+ * <p/>
+ * Subclasses that override this method need to ensure the behavior
+ * of the analyzer is consistent with the implementation of toInternal.
+ * </p>
+ *
+ * @see #toInternal
+ * @see #setAnalyzer
+ * @see #getQueryAnalyzer
+ */
+ public void setMultiTermAnalyzer(Analyzer analyzer) {
+   // This default implementation rejects the call: it logs the failure once
+   // and throws. Subclasses that accept an analyzer override this method.
+   final String message =
+       "FieldType: " + this.getClass().getSimpleName() +
+       " (" + typeName + ") does not support specifying an analyzer";
+   final SolrException unsupported = new SolrException(ErrorCode.SERVER_ERROR, message);
+   SolrException.logOnce(log, null, unsupported);
+   throw unsupported;
+ }
+
+ /**
+ * Sets the Analyzer to be used when querying fields of this type.
*
* <p>
* The default implementation throws a SolrException.
@@ -500,6 +549,7 @@ public abstract class FieldType extends
throw e;
}
+
/**
* Renders the specified field as XML
*/
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/IndexSchema.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/IndexSchema.java?rev=1206258&r1=1206257&r2=1206258&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/IndexSchema.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/IndexSchema.java Fri Nov 25 16:54:07 2011
@@ -18,11 +18,13 @@
package org.apache.solr.schema;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.Version;
+import org.apache.solr.analysis.*;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
@@ -31,10 +33,6 @@ import org.apache.solr.common.util.Syste
import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.Config;
import org.apache.solr.core.SolrResourceLoader;
-import org.apache.solr.analysis.CharFilterFactory;
-import org.apache.solr.analysis.TokenFilterFactory;
-import org.apache.solr.analysis.TokenizerChain;
-import org.apache.solr.analysis.TokenizerFactory;
import org.apache.solr.search.SolrQueryParser;
import org.apache.solr.util.plugin.AbstractPluginLoader;
import org.apache.solr.util.plugin.SolrCoreAware;
@@ -444,6 +442,11 @@ public final class IndexSchema {
Node anode = (Node)xpath.evaluate(expression, node, XPathConstants.NODE);
Analyzer queryAnalyzer = readAnalyzer(anode);
+ expression = "./analyzer[@type='multiterm']";
+ anode = (Node) xpath.evaluate(expression, node, XPathConstants.NODE);
+ Analyzer multiAnalyzer = readAnalyzer(anode);
+
+
// An analyzer without a type specified, or with type="index"
expression = "./analyzer[not(@type)] | ./analyzer[@type='index']";
anode = (Node)xpath.evaluate(expression, node, XPathConstants.NODE);
@@ -451,9 +454,17 @@ public final class IndexSchema {
if (queryAnalyzer==null) queryAnalyzer=analyzer;
if (analyzer==null) analyzer=queryAnalyzer;
+ if (multiAnalyzer == null) {
+ Boolean legacyMatch = ! solrConfig.luceneMatchVersion.onOrAfter(Version.LUCENE_36);;
+ legacyMatch = (DOMUtil.getAttr(node, "legacyMultiTerm", null) == null) ? legacyMatch :
+ Boolean.parseBoolean(DOMUtil.getAttr(node, "legacyMultiTerm", null));
+ multiAnalyzer = constructMultiTermAnalyzer(queryAnalyzer, legacyMatch);
+ }
+
if (analyzer!=null) {
ft.setAnalyzer(analyzer);
ft.setQueryAnalyzer(queryAnalyzer);
+ ft.setMultiTermAnalyzer(multiAnalyzer);
}
if (ft instanceof SchemaAware){
schemaAware.add((SchemaAware) ft);
@@ -697,6 +708,42 @@ public final class IndexSchema {
}
}
+ // The point here is that, if no multitermanalyzer was specified in the schema file, do one of several things:
+ // 1> If legacyMultiTerm == false, assemble a new analyzer composed of all of the charfilters,
+ // lowercase filters and asciifoldingfilter.
+ // 2> If legacyMultiTerm == true just construct a KeywordAnalyzer. That should mimic current behavior.
+ // Do the same if they've specified that the old behavior is required (legacyMultiTerm="true")
+
+ /**
+  * Builds the analyzer used for multi-term queries when the schema does not
+  * declare one explicitly.
+  *
+  * @param queryAnalyzer the field type's query analyzer; null yields null
+  * @param legacyMultiTerm when true, a plain KeywordAnalyzer is returned
+  * @return null, a KeywordAnalyzer (legacy mode, or the query analyzer is not a
+  *         TokenizerChain), or a new TokenizerChain made of the query chain's char
+  *         filters, a whitespace tokenizer, and fresh copies of every lowercase and
+  *         ASCII-folding filter factory found in the query chain
+  */
+ private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer, Boolean legacyMultiTerm) {
+   if (queryAnalyzer == null) return null;
+
+   if (legacyMultiTerm || (!(queryAnalyzer instanceof TokenizerChain))) {
+     return new KeywordAnalyzer();
+   }
+
+   TokenizerChain tc = (TokenizerChain) queryAnalyzer;
+   TokenFilterFactory[] queryFilters = tc.getTokenFilterFactories();
+
+   // Each factory in the query chain contributes at most one copy, so the chain
+   // length is a safe upper bound. The previous fixed size of 2 overflowed when
+   // a chain listed more than two lowercase/folding filters.
+   TokenFilterFactory[] filters = new TokenFilterFactory[queryFilters.length];
+   int idx = 0;
+   for (TokenFilterFactory factory : queryFilters) {
+     TokenFilterFactory copy;
+     if (factory instanceof LowerCaseFilterFactory) {
+       copy = new LowerCaseFilterFactory();
+     } else if (factory instanceof ASCIIFoldingFilterFactory) {
+       copy = new ASCIIFoldingFilterFactory();
+     } else {
+       continue;
+     }
+     copy.init(factory.getArgs());
+     filters[idx++] = copy;
+   }
+   WhitespaceTokenizerFactory white = new WhitespaceTokenizerFactory();
+   white.init(tc.getTokenizerFactory().getArgs());
+
+   return new TokenizerChain(tc.getCharFilterFactories(),
+       white,
+       Arrays.copyOfRange(filters, 0, idx));
+ }
+
+
/**
* Register one or more new Dynamic Field with the Schema.
* @param f The {@link org.apache.solr.schema.SchemaField}
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/SchemaField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/SchemaField.java?rev=1206258&r1=1206257&r2=1206258&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/SchemaField.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/SchemaField.java Fri Nov 25 16:54:07 2011
@@ -99,6 +99,9 @@ public final class SchemaField extends F
boolean isTokenized() { return (properties & TOKENIZED)!=0; }
boolean isBinary() { return (properties & BINARY)!=0; }
+ /** Reports whether the LEGACY_MULTITERM bit is set in this field's properties. */
+ boolean legacyMultiTerm() {
+   final boolean legacyBitSet = (properties & LEGACY_MULTITERM) != 0;
+   return legacyBitSet;
+ }
public Fieldable createField(String val, float boost) {
return type.createField(this,val,boost);
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/TextField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/TextField.java?rev=1206258&r1=1206257&r2=1206258&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/TextField.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/schema/TextField.java Fri Nov 25 16:54:07 2011
@@ -98,6 +98,11 @@ public class TextField extends FieldType
this.queryAnalyzer = analyzer;
}
+ /** Stores the given analyzer as this type's multi-term analyzer. */
+ @Override
+ public void setMultiTermAnalyzer(Analyzer analyzer) {
+   multiTermAnalyzer = analyzer;
+ }
+
+
static Query parseFieldQuery(QParser parser, Analyzer analyzer, String field, String queryText) {
int phraseSlop = 0;
boolean enablePositionIncrements = true;
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java?rev=1206258&r1=1206257&r2=1206258&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/search/SolrQueryParser.java Fri Nov 25 16:54:07 2011
@@ -25,7 +25,6 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
-import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.Analyzer;
import org.apache.solr.analysis.*;
import org.apache.solr.common.SolrException;
@@ -126,6 +125,14 @@ public class SolrQueryParser extends Que
}
}
+ /**
+  * Analyzes <code>part</code> with the supplied analyzer, but only when
+  * <code>field</code> resolves to a schema field whose type is a TextField.
+  * A null part, an unknown field, or any other field type returns
+  * <code>part</code> unchanged.
+  */
+ protected String analyzeIfMultitermTermText(String field, String part, Analyzer analyzer) {
+   if (part == null) {
+     return null;
+   }
+
+   final SchemaField schemaField = schema.getFieldOrNull(field);
+   final boolean isTextField = schemaField != null && schemaField.getType() instanceof TextField;
+   return isTextField ? analyzeMultitermTerm(field, part, analyzer) : part;
+ }
+
+
@Override
protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException {
checkNullField(field);
@@ -161,6 +168,9 @@ public class SolrQueryParser extends Que
@Override
protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive) throws ParseException {
checkNullField(field);
+ part1 = analyzeIfMultitermTermText(field, part1, schema.getFieldType(field).getMultiTermAnalyzer());
+ part2 = analyzeIfMultitermTermText(field, part2, schema.getFieldType(field).getMultiTermAnalyzer());
+
SchemaField sf = schema.getField(field);
return sf.getType().getRangeQuery(parser, sf,
"*".equals(part1) ? null : part1,
@@ -175,6 +185,8 @@ public class SolrQueryParser extends Que
termStr = termStr.toLowerCase();
}
+ termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
+
// TODO: toInternal() won't necessarily work on partial
// values, so it looks like we need a getPrefix() function
// on fieldtype? Or at the minimum, a method on fieldType
@@ -189,14 +201,14 @@ public class SolrQueryParser extends Que
PrefixQuery prefixQuery = new PrefixQuery(t);
return prefixQuery;
}
-
@Override
protected Query getWildcardQuery(String field, String termStr) throws ParseException {
// *:* -> MatchAllDocsQuery
if ("*".equals(field) && "*".equals(termStr)) {
return newMatchAllDocsQuery();
}
-
+ termStr = analyzeIfMultitermTermText(field, termStr, schema.getFieldType(field).getMultiTermAnalyzer());
+
// can we use reversed wildcards in this field?
String type = schema.getFieldType(field).getTypeName();
ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
@@ -216,4 +228,5 @@ public class SolrQueryParser extends Que
}
return q;
}
+
}
Added: lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-folding.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-folding.xml?rev=1206258&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-folding.xml (added)
+++ lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-folding.xml Fri Nov 25 16:54:07 2011
@@ -0,0 +1,145 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+
+<schema name="test" version="1.0">
+ <types>
+ <fieldtype name="string" class="solr.StrField" sortMissingLast="true" multiValued="false"/>
+
+ <fieldType name="text" class="solr.TextField" multiValued="false">
+ <analyzer>
+ <tokenizer class="solr.PatternTokenizerFactory" pattern="\s+"/>
+ <filter class="solr.ASCIIFoldingFilterFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="text_multi" class="solr.TextField" multiValued="true">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.ASCIIFoldingFilterFactory"/>
+ <filter class="solr.TrimFilterFactory"/>
+ </analyzer>
+ <analyzer type="multiterm"> <!-- Intentionally different to test that these are kept distinct -->
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.ASCIIFoldingFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="text_multi_bad" class="solr.TextField" multiValued="false">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.ASCIIFoldingFilterFactory"/>
+ <filter class="solr.TrimFilterFactory"/>
+ </analyzer>
+ <analyzer type="multiterm"> <!-- Intentionally uses WordDelimiterFilterFactory so that one term can analyze to several tokens; see testMultiBad -->
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.ASCIIFoldingFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+
+ <fieldType name="text_ws" class="solr.TextField" multiValued="true">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.ASCIIFoldingFilterFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="text_rev" class="solr.TextField" legacyMultiTerm="false">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.ASCIIFoldingFilterFactory"/>
+ <filter class="solr.ReversedWildcardFilterFactory" withOriginal="false"
+ maxPosAsterisk="1" maxPosQuestion="2" maxFractionAsterisk="0.99"
+ minTrailing="1"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.ASCIIFoldingFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="text_lower_tokenizer" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.LowerCaseTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="text_charfilter" class="solr.TextField" multiValued="false">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.ASCIIFoldingFilterFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="text_oldstyle" class="solr.TextField" multiValued="false" legacyMultiTerm="true">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.ASCIIFoldingFilterFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.TrimFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="byte" class="solr.ByteField" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="short" class="solr.ShortField" omitNorms="true" positionIncrementGap="0"/>
+ <fieldtype name="boolean" class="solr.BoolField" sortMissingLast="true"/>
+ <fieldtype name="date" class="solr.TrieDateField" precisionStep="0"/>
+ </types>
+
+ <fields>
+ <field name="id" type="string" indexed="true" stored="true" required="true"/>
+ <field name="int_f" type="int"/>
+ <field name="float_f" type="float"/>
+ <field name="long_f" type="long"/>
+ <field name="double_f" type="double"/>
+ <field name="byte_f" type="byte"/>
+ <field name="short_f" type="short"/>
+ <field name="bool_f" type="boolean"/>
+ <field name="date_f" type="date"/>
+
+ <field name="content" type="text" indexed="true" stored="true"/>
+ <field name="content_ws" type="text_ws" indexed="true" stored="true"/>
+ <field name="content_rev" type="text_rev" indexed="true" stored="true"/>
+ <field name="content_multi" type="text_multi" indexed="true" stored="true"/>
+ <field name="content_lower_token" type="text_multi" indexed="true" stored="true"/>
+ <field name="content_oldstyle" type="text_oldstyle" indexed="true" stored="true"/>
+ <field name="content_charfilter" type="text_charfilter" indexed="true" stored="true"/>
+ <field name="content_multi_bad" type="text_multi_bad" indexed="true" stored="true"/>
+ </fields>
+
+ <defaultSearchField>content</defaultSearchField>
+ <uniqueKey>id</uniqueKey>
+
+</schema>
Added: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java?rev=1206258&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java (added)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/schema/MultiTermTest.java Fri Nov 25 16:54:07 2011
@@ -0,0 +1,87 @@
+package org.apache.solr.schema;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.analysis.*;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class MultiTermTest extends SolrTestCaseJ4 {
+ public String getCoreName() {
+ return "basic";
+ }
+
+ @BeforeClass
+ public static void beforeTests() throws Exception {
+ initCore("solrconfig.xml", "schema-folding.xml");
+ }
+
+ @Test
+ public void testMultiFound() {
+ SchemaField field = h.getCore().getSchema().getField("content_multi");
+ Analyzer analyzer = field.getType().getMultiTermAnalyzer();
+ assertTrue(analyzer instanceof TokenizerChain);
+ assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
+ TokenizerChain tc = (TokenizerChain) analyzer;
+ for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
+ assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
+ }
+
+ analyzer = field.getType().getAnalyzer();
+ assertTrue(analyzer instanceof TokenizerChain);
+ assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
+ tc = (TokenizerChain) analyzer;
+ for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
+ assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof TrimFilterFactory));
+ }
+
+ assertTrue(tc.getCharFilterFactories().length == 0);
+ }
+
+ @Test
+ public void testQueryCopiedToMulti() {
+ SchemaField field = h.getCore().getSchema().getField("content_charfilter");
+ Analyzer analyzer = field.getType().getMultiTermAnalyzer();
+ assertTrue(analyzer instanceof TokenizerChain);
+ assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
+ TokenizerChain tc = (TokenizerChain) analyzer;
+ for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
+ assertTrue(factory instanceof LowerCaseFilterFactory);
+ }
+
+ assertTrue(tc.getCharFilterFactories().length == 1);
+ assertTrue(tc.getCharFilterFactories()[0] instanceof MappingCharFilterFactory);
+ }
+
+ @Test
+ public void testDefaultCopiedToMulti() {
+ SchemaField field = h.getCore().getSchema().getField("content_ws");
+ Analyzer analyzer = field.getType().getMultiTermAnalyzer();
+ assertTrue(analyzer instanceof TokenizerChain);
+ assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
+ TokenizerChain tc = (TokenizerChain) analyzer;
+ for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
+ assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
+ }
+
+ assertTrue(tc.getCharFilterFactories().length == 0);
+
+ }
+}
Added: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java?rev=1206258&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java (added)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/search/TestFoldingMultitermQuery.java Fri Nov 25 16:54:07 2011
@@ -0,0 +1,231 @@
+package org.apache.solr.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexWriter;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
+
+ public String getCoreName() {
+ return "basic";
+ }
+
+ @BeforeClass
+ public static void beforeTests() throws Exception {
+ initCore("solrconfig.xml", "schema-folding.xml");
+ IndexWriter iw;
+
+ String docs[] = {
+ "abcdefg1 finger",
+ "gangs hijklmn1",
+ "opqrstu1 zilly",
+ };
+
+ // prepare the index
+ for (int i = 0; i < docs.length; i++) {
+ String num = Integer.toString(i);
+ String boolVal = ((i % 2) == 0) ? "true" : "false";
+ assertU(adoc("id", num,
+ "int_f", num,
+ "float_f", num,
+ "long_f", num,
+ "double_f", num,
+ "byte_f", num,
+ "short_f", num,
+ "bool_f", boolVal,
+ "date_f", "200" + Integer.toString(i % 10) + "-01-01T00:00:00Z",
+ "content", docs[i],
+ "content_ws", docs[i],
+ "content_rev", docs[i],
+ "content_multi", docs[i],
+ "content_lower_token", docs[i],
+ "content_oldstyle", docs[i],
+ "content_charfilter", docs[i],
+ "content_multi_bad", docs[i]
+ ));
+ }
+ assertU(optimize());
+ }
+
+ @Test
+ public void testPrefixCaseAccentFolding() throws Exception {
+ String matchOneDocPrefixUpper[][] = {
+ {"A*", "ÃB*", "ABÃ*"}, // these should find only doc 0
+ {"H*", "HÃ*", "HìJ*"}, // these should find only doc 1
+ {"O*", "ÃP*", "OPQ*"}, // these should find only doc 2
+ };
+
+ String matchRevPrefixUpper[][] = {
+ {"*Ä1", "*DEfG1", "*EfG1"},
+ {"*N1", "*LmÅ1", "*MÃ1"},
+ {"*Ç1", "*sTu1", "*RÅ TU1"}
+ };
+
+ // test the prefix queries find only one doc where the query is uppercased. Must go through query parser here!
+ for (int idx = 0; idx < matchOneDocPrefixUpper.length; idx++) {
+ for (int jdx = 0; jdx < matchOneDocPrefixUpper[idx].length; jdx++) {
+ String me = matchOneDocPrefixUpper[idx][jdx];
+ assertQ(req("q", "content:" + me),
+ "//*[@numFound='1']",
+ "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+ assertQ(req("q", "content_ws:" + me),
+ "//*[@numFound='1']",
+ "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+ assertQ(req("q", "content_multi:" + me),
+ "//*[@numFound='1']",
+ "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+ assertQ(req("q", "content_lower_token:" + me),
+ "//result[@numFound='1']",
+ "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+ }
+ }
+ for (int idx = 0; idx < matchRevPrefixUpper.length; idx++) {
+ for (int jdx = 0; jdx < matchRevPrefixUpper[idx].length; jdx++) {
+ String me = matchRevPrefixUpper[idx][jdx];
+ assertQ(req("q", "content_rev:" + me),
+ "//*[@numFound='1']",
+ "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+ }
+ }
+ }
+
+ // test the wildcard queries find only one doc where the query is uppercased and/or accented.
+ @Test
+ public void testWildcardCaseAccentFolding() throws Exception {
+ String matchOneDocWildUpper[][] = {
+ {"Ã*C*", "ÃB*1", "ABÃ*g1", "Ã*FG1"}, // these should find only doc 0
+ {"H*k*", "HÃ*l?*", "HìJ*n*", "HìJ*m*"}, // these should find only doc 1
+ {"O*Å*", "ÃP*Å???", "OPQ*S?Å®*", "ÃP*1"}, // these should find only doc 2
+ };
+
+ for (int idx = 0; idx < matchOneDocWildUpper.length; idx++) {
+ for (int jdx = 0; jdx < matchOneDocWildUpper[idx].length; jdx++) {
+ String me = matchOneDocWildUpper[idx][jdx];
+ assertQ("Error with " + me, req("q", "content:" + me),
+ "//result[@numFound='1']",
+ "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+ assertQ(req("q", "content_ws:" + me),
+ "//result[@numFound='1']",
+ "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+ assertQ(req("q", "content_multi:" + me),
+ "//result[@numFound='1']",
+ "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+ assertQ(req("q", "content_lower_token:" + me),
+ "//result[@numFound='1']",
+ "//*[@name='id'][.='" + Integer.toString(idx) + "']");
+ }
+ }
+ }
+
+ // Phrases should fail. This test is mainly a marker so if phrases ever do start working with wildcards we go
+ // and update the documentation
+ @Test
+ public void testPhrase() {
+ assertQ(req("q", "content:\"silly ABCD*\""),
+ "//result[@numFound='0']");
+ }
+
+ // Make sure the legacy behavior flag is honored
+ @Test
+ public void testLegacyBehavior() {
+ assertQ(req("q", "content_oldstyle:ABCD*"),
+ "//result[@numFound='0']");
+ }
+
+ @Test
+ public void testWildcardRange() {
+ assertQ(req("q", "content:[* TO *]"),
+ "//result[@numFound='3']");
+ }
+
+
+ // Does the char filter get correctly handled?
+ @Test
+ public void testCharFilter() {
+ assertQ(req("q", "content_charfilter:" + "Ã*C*"),
+ "//result[@numFound='1']",
+ "//*[@name='id'][.='0']");
+ assertQ(req("q", "content_charfilter:" + "ABÃ*g1"),
+ "//result[@numFound='1']",
+ "//*[@name='id'][.='0']");
+ assertQ(req("q", "content_charfilter:" + "HÃ*l?*"),
+ "//result[@numFound='1']",
+ "//*[@name='id'][.='1']");
+ }
+
+ @Test
+ public void testRangeQuery() {
+ assertQ(req("q", "content:" + "{Ȫp*1 TO QŮ*}"),
+ "//result[@numFound='1']",
+ "//*[@name='id'][.='2']");
+
+ assertQ(req("q", "content:" + "[Ãb* TO f?Ãg?r]"),
+ "//result[@numFound='1']",
+ "//*[@name='id'][.='0']");
+
+ }
+
+ @Test
+ public void testNonTextTypes() {
+ String[] intTypes = {"int_f", "float_f", "long_f", "double_f", "byte_f", "short_f"};
+
+ for (String str : intTypes) {
+ assertQ(req("q", str + ":" + "0"),
+ "//result[@numFound='1']",
+ "//*[@name='id'][.='0']");
+
+ assertQ(req("q", str + ":" + "[0 TO 2]"),
+ "//result[@numFound='3']",
+ "//*[@name='id'][.='0']",
+ "//*[@name='id'][.='1']",
+ "//*[@name='id'][.='2']");
+ }
+ assertQ(req("q", "bool_f:true"),
+ "//result[@numFound='2']",
+ "//*[@name='id'][.='0']",
+ "//*[@name='id'][.='2']");
+
+ assertQ(req("q", "bool_f:[false TO true]"),
+ "//result[@numFound='3']",
+ "//*[@name='id'][.='0']",
+ "//*[@name='id'][.='1']",
+ "//*[@name='id'][.='2']");
+
+ assertQ(req("q", "date_f:2000-01-01T00\\:00\\:00Z"),
+ "//result[@numFound='1']",
+ "//*[@name='id'][.='0']");
+
+ assertQ(req("q", "date_f:[2000-12-31T23:59:59.999Z TO 2002-01-02T00:00:01Z]"),
+ "//result[@numFound='2']",
+ "//*[@name='id'][.='1']",
+ "//*[@name='id'][.='2']");
+ }
+
+ @Test
+ public void testMultiBad() {
+ try {
+ assertQ(req("q", "content_multi_bad:" + "abCD*"));
+ fail("Should throw exception when token evaluates to more than one term");
+ } catch (Exception expected) {
+ assertTrue(expected.getCause() instanceof IllegalArgumentException);
+ }
+ }
+}
\ No newline at end of file
Modified: lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml?rev=1206258&r1=1206257&r2=1206258&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml (original)
+++ lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml Fri Nov 25 16:54:07 2011
@@ -443,6 +443,78 @@
<tokenizer class="solr.PathHierarchyTokenizerFactory"/>
</analyzer>
</fieldType>
+
+ <!-- Illustrates the new "multiterm" analyzer definition. The <fieldType> can take a new
+ parameter legacyMultiTerm="true" if the old behavior is desired. The new default
+ behavior as of 3.6+ is to automatically define a multiterm analyzer.
+ -->
+ <fieldType name="text_multiterm" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <!-- Illustrates the use of a new analyzer type "multiterm". See the Wiki page "Multiterm
+ Query Analysis" and SOLR-2438 for full details. The short form is that this analyzer is
+ applied to multi-term query terms (prefix, wildcard, range) if specified. This allows, among other
+ things, not having to lowercase wildcard terms on the client.
+
+ In the absence of this section, the new default behavior (3.6, 4.0) is to construct
+ one of these from the query analyzer that incorporates any defined charfilters, a
+ WhitespaceTokenizer, a LowerCaseFilter (if defined), and an ASCIIFoldingFilter
+ (if defined).
+
+ Arguably, this is an expert-level analyzer; most cases will be handled by an instance
+ that is automatically constructed from the query analyzer.
+
+ -->
+ <analyzer type="multiterm">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.ASCIIFoldingFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Illustrates the new "multiterm" analyzer definition. The <fieldType> can take a new
+ parameter legacyMultiTerm="true" if the old behavior is desired. The new default
+ behavior as of 3.6+ is to automatically define a multiterm analyzer.
+ -->
+ <fieldType name="text_multiterm" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <!-- Illustrates the use of a new analyzer type "multiterm". See the Wiki page "Multiterm
+ Query Analysis" and SOLR-2438 for full details. The short form is that this analyzer is
+ applied to multi-term query terms (prefix, wildcard, range) if specified. This allows, among other
+ things, not having to lowercase wildcard terms on the client.
+
+ In the absence of this section, the new default behavior (3.6, 4.0) is to construct
+ one of these from the query analyzer that incorporates any defined charfilters, a
+ WhitespaceTokenizer, a LowerCaseFilter (if defined), and an ASCIIFoldingFilter
+ (if defined).
+
+ Arguably, this is an expert-level analyzer; most cases will be handled by an instance
+ that is automatically constructed from the query analyzer.
+
+ -->
+ <analyzer type="multiterm">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.ASCIIFoldingFilterFactory"/>
+ </analyzer>
+ </fieldType>
<!-- since fields of this type are by default not stored or indexed,
any data added to them will be ignored outright. -->
@@ -552,7 +624,6 @@
<!--
<field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
-->
-
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.