Posted to commits@lucene.apache.org by da...@apache.org on 2018/10/23 00:05:28 UTC
[07/52] [abbrv] [partial] lucene-solr:jira/gradle: Add gradle support
for Solr
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0ae21ad0/solr/core/src/java/org/apache/solr/handler/component/PhrasesIdentificationComponent.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/component/PhrasesIdentificationComponent.java b/solr/core/src/java/org/apache/solr/handler/component/PhrasesIdentificationComponent.java
deleted file mode 100644
index bac5a4c..0000000
--- a/solr/core/src/java/org/apache/solr/handler/component/PhrasesIdentificationComponent.java
+++ /dev/null
@@ -1,1129 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.handler.component;
-
-import java.io.IOException;
-import java.lang.invoke.MethodHandles;
-import java.util.Arrays;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-import java.util.LongSummaryStatistics;
-import java.util.Map;
-import java.util.TreeMap;
-import java.util.stream.Collectors;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.shingle.ShingleFilter;
-import org.apache.lucene.analysis.shingle.ShingleFilterFactory;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CharsRefBuilder;
-
-import org.apache.solr.analysis.TokenizerChain;
-import org.apache.solr.client.solrj.SolrResponse;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.common.SolrException.ErrorCode;
-import org.apache.solr.common.params.CommonParams;
-import org.apache.solr.common.params.ModifiableSolrParams;
-import org.apache.solr.common.params.ShardParams;
-import org.apache.solr.common.params.SolrParams;
-import org.apache.solr.common.util.NamedList;
-import org.apache.solr.common.util.SimpleOrderedMap;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.search.SolrIndexSearcher;
-import org.apache.solr.schema.FieldType;
-import org.apache.solr.schema.SchemaField;
-import org.apache.solr.util.SolrPluginUtils;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-
-/**
- * A component that can be used in isolation, or in conjunction with {@link QueryComponent} to identify
- * & score "phrases" found in the input string, based on shingles in indexed fields.
- *
- * <p>
- * The most common way to use this component is in conjunction with fields that use
- * {@link ShingleFilterFactory} on both the <code>index</code> and <code>query</code> analyzers.
- * An example field type configuration would be something like this...
- * </p>
- * <pre class="prettyprint">
- * <fieldType name="phrases" class="solr.TextField" positionIncrementGap="100">
- * <analyzer type="index">
- * <tokenizer class="solr.StandardTokenizerFactory"/>
- * <filter class="solr.LowerCaseFilterFactory"/>
- * <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="3" outputUnigrams="true"/>
- * </analyzer>
- * <analyzer type="query">
- * <tokenizer class="solr.StandardTokenizerFactory"/>
- * <filter class="solr.LowerCaseFilterFactory"/>
- * <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="7" outputUnigramsIfNoShingles="true" outputUnigrams="true"/>
- * </analyzer>
- * </fieldType>
- * </pre>
- * <p>
- * ...where the <code>query</code> analyzer's <code>maxShingleSize="7"</code> determines the maximum
- * possible phrase length that can be heuristically deduced, while the <code>index</code> analyzer's
- * <code>maxShingleSize="3"</code> determines the accuracy of the phrases identified. The larger the
- * indexed <code>maxShingleSize</code>, the higher the accuracy. Both analyzers must include
- * <code>minShingleSize="2" outputUnigrams="true"</code>.
- * </p>
- * <p>
- * With a field type like this, one or more fields can be specified (with weights) via a
- * <code>phrases.fields</code> param to request that this component identify possible phrases in the
- * input <code>q</code> param, or an alternative <code>phrases.q</code> override param. The identified
- * phrases will include their scores relative to each field specified, as well as an overall weighted
- * score based on the field weights provided by the client. Higher score values indicate a greater
- * confidence in the Phrase.
- * </p>
- *
- * <p>
- * <b>NOTE:</b> In a distributed request, this component uses a single phase (piggy-backing on the
- * {@link ShardRequest#PURPOSE_GET_TOP_IDS} generated by {@link QueryComponent} if it is in use) to
- * collect all field & shingle stats. No "refinement" requests are used.
- * </p>
- *
- * @lucene.experimental
- */
-public class PhrasesIdentificationComponent extends SearchComponent {
- private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-
- /** The only shard purpose that will cause this component to do work & return data during a shard request */
- public static final int SHARD_PURPOSE = ShardRequest.PURPOSE_GET_TOP_IDS;
-
- /** Name, also used as a request param to identify whether the user query concerns this component */
- public static final String COMPONENT_NAME = "phrases";
-
- // TODO: ideally these should live in a common.params class?
- public static final String PHRASE_INPUT = "phrases.q";
- public static final String PHRASE_FIELDS = "phrases.fields";
- public static final String PHRASE_ANALYSIS_FIELD = "phrases.analysis.field";
- public static final String PHRASE_SUMMARY_PRE = "phrases.pre";
- public static final String PHRASE_SUMMARY_POST = "phrases.post";
- public static final String PHRASE_INDEX_MAXLEN = "phrases.maxlength.index";
- public static final String PHRASE_QUERY_MAXLEN = "phrases.maxlength.query";
-
- @Override
- public void prepare(ResponseBuilder rb) throws IOException {
- final SolrParams params = rb.req.getParams();
- if (!params.getBool(COMPONENT_NAME, false)) {
- return;
- }
- if (params.getBool(ShardParams.IS_SHARD, false)) {
- // only one stage/purpose where we should do any work on a shard
- if (0 == (SHARD_PURPOSE & params.getInt(ShardParams.SHARDS_PURPOSE, 0))) {
- return;
- }
- }
-
- // if we're still here, then we should parse & validate our input,
- // putting it in the request context so our process method knows it should do work
- rb.req.getContext().put(this.getClass(), PhrasesContextData.parseAndValidateRequest(rb.req));
- }
-
- @Override
- public int distributedProcess(ResponseBuilder rb) {
- final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass());
- if (null == contextData) {
- // if prepare didn't give us anything to work with, then we should do nothing
- return ResponseBuilder.STAGE_DONE;
- }
-
- if (rb.stage < ResponseBuilder.STAGE_EXECUTE_QUERY) {
- return ResponseBuilder.STAGE_EXECUTE_QUERY;
-
- } else if (rb.stage == ResponseBuilder.STAGE_EXECUTE_QUERY) {
- // if we're being used in conjunction with QueryComponent, it should have already created
- // (in this stage) the only ShardRequest we need...
- for (ShardRequest sreq : rb.outgoing) {
- if (0 != (SHARD_PURPOSE & sreq.purpose) ) {
- return ResponseBuilder.STAGE_GET_FIELDS;
- }
- }
- // ...if we can't find it, then evidently we're being used in isolation,
- // and we need to create our own ShardRequest...
- ShardRequest sreq = new ShardRequest();
- sreq.purpose = SHARD_PURPOSE;
- sreq.params = new ModifiableSolrParams(rb.req.getParams());
- sreq.params.remove(ShardParams.SHARDS);
- rb.addRequest(this, sreq);
- return ResponseBuilder.STAGE_GET_FIELDS;
-
- } else if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
- // NOTE: we don't do any actual work in this stage, but we need to ensure that even if
- // we are being used in isolation w/o QueryComponent that SearchHandler "tracks" a STAGE_GET_FIELDS
- // so that finishStage(STAGE_GET_FIELDS) is called on us and we can add our merged results
- // (w/o needing extra code paths for merging phrase results when QueryComponent is/is not used)
- return ResponseBuilder.STAGE_DONE;
- }
-
- return ResponseBuilder.STAGE_DONE;
- }
-
- @Override
- public void finishStage(ResponseBuilder rb) {
- // NOTE: we don't do this after STAGE_EXECUTE_QUERY because if we're also being used with
- // QueryComponent, we don't want to add our results to the response until *after*
- // QueryComponent adds the main DocList
-
- final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass());
- if (null == contextData || rb.stage != ResponseBuilder.STAGE_GET_FIELDS) {
- // if prepare didn't give us anything to work with, or this isn't our stage, then do nothing
- return;
- }
-
- // sanity check: the shard requests we use/piggy-back on should only happen once per shard,
- // but let's future-proof ourselves against the possibility that some shards might get/respond
- // to the same request "purpose" multiple times...
- final BitSet shardsHandled = new BitSet(rb.shards.length);
-
- // Collect Shard responses
- for (ShardRequest sreq : rb.finished) {
- if (0 != (sreq.purpose & SHARD_PURPOSE)) {
- for (ShardResponse shardRsp : sreq.responses) {
- final int shardNum = rb.getShardNum(shardRsp.getShard());
- if (! shardsHandled.get(shardNum)) {
- shardsHandled.set(shardNum);
- // shards.tolerant=true can cause nulls on exceptions/errors
- // if we don't get phrases/stats from a shard, just ignore that shard
- final SolrResponse rsp = shardRsp.getSolrResponse();
- if (null == rsp) continue;
- final NamedList<Object> top = rsp.getResponse();
- if (null == top) continue;
- final NamedList<Object> phrasesWrapper = (NamedList<Object>) top.get("phrases");
- if (null == phrasesWrapper) continue;
- final List<NamedList<Object>> shardPhrases = (List<NamedList<Object>>) phrasesWrapper.get("_all");
- if (null == shardPhrases) continue;
-
- Phrase.populateStats(contextData.allPhrases, shardPhrases);
- }
- }
- }
- }
- scoreAndAddResultsToResponse(rb, contextData);
- }
-
-
- @Override
- public void process(ResponseBuilder rb) throws IOException {
- final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass());
- if (null == contextData) {
- // if prepare didn't give us anything to work with, then we should do nothing
- return;
- }
-
- // regardless of single node / shard, we need local stats...
- Phrase.populateStats(contextData.allPhrases, contextData.fieldWeights.keySet(), rb.req.getSearcher());
-
- if ( rb.req.getParams().getBool(ShardParams.IS_SHARD, false) ) {
- // shard request, return stats for all phrases (in original order)
- SimpleOrderedMap<Object> output = new SimpleOrderedMap<>();
- output.add("_all", Phrase.formatShardResponse(contextData.allPhrases));
- // TODO: might want to add numDocs() & getSumTotalTermFreq(f)/getDocCount(f) stats from each field...
- // so that we can sum/merge them for use in scoring?
- rb.rsp.add("phrases", output);
- } else {
- // full single node request...
- scoreAndAddResultsToResponse(rb, contextData);
- }
- }
-
- /**
- * Helper method (suitable for both single node & distributed coordinator node) to
- * score, sort, and format the end user response once all phrases have been populated with stats.
- */
- private void scoreAndAddResultsToResponse(final ResponseBuilder rb, final PhrasesContextData contextData) {
- assert null != contextData : "Should not be called if no phrase data to use";
- if (null == contextData) {
- // if prepare didn't give us anything to work with, then we should do nothing
- return;
- }
-
- SimpleOrderedMap<Object> output = new SimpleOrderedMap<>();
- rb.rsp.add("phrases", output);
- output.add("input", contextData.rawInput);
-
- if (0 == contextData.allPhrases.size()) {
- // w/o any phrases, the summary is just the input again...
- output.add("summary", contextData.rawInput);
- output.add("details", Collections.<Object>emptyList());
- return;
- }
-
- Phrase.populateScores(contextData);
- final int maxPosition = contextData.allPhrases.get(contextData.allPhrases.size()-1).getPositionEnd();
-
- final List<Phrase> validScoringPhrasesSorted = contextData.allPhrases.stream()
- // TODO: ideally this cutoff of "0.0" should be a request option...
- // so users can tune how aggressive/conservative they want to be in finding phrases
- // but for that to be useful, we need:
- // - more hard & fast documentation about the "range" of scores that may be returned
- // - "useful" scores for single words
- .filter(p -> 0.0D < p.getTotalScore())
- .sorted(Comparator.comparing((p -> p.getTotalScore()), Collections.reverseOrder()))
- .collect(Collectors.toList());
-
- // we want to return only high scoring phrases that don't overlap w/ higher scoring phrases
- final BitSet positionsCovered = new BitSet(maxPosition+1);
- final List<Phrase> results = new ArrayList<>(maxPosition);
- for (Phrase phrase : validScoringPhrasesSorted) {
- final BitSet phrasePositions = phrase.getPositionsBitSet();
-
- if (! phrasePositions.intersects(positionsCovered)) {
- // we can use this phrase, record it...
- positionsCovered.or(phrasePositions);
- results.add(phrase);
- } // else: overlaps higher scoring position(s), skip this phrase
-
- if (positionsCovered.cardinality() == maxPosition+1) {
- // all positions are covered, so we can bail out and skip the rest
- break;
- }
- }
-
- // a "quick summary" of the suggested parsing
- output.add("summary", contextData.summarize(results));
- // useful user level info on every (high scoring) phrase found (in descending score order)
- output.add("details", results.stream()
- .map(p -> p.getDetails()).collect(Collectors.toList()));
- }
-
- @Override
- public String getDescription() {
- return "Phrases Identification Component";
- }
-
- /**
- * Simple container for all request options and data this component needs to store in the Request Context
- * @lucene.internal
- */
- public static final class PhrasesContextData {
-
- public final String rawInput;
- public final int maxIndexedPositionLength;
- public final int maxQueryPositionLength;
- public final Map<String,Double> fieldWeights;
- public final SchemaField analysisField;
- public final List<Phrase> allPhrases;
- public final String summaryPre;
- public final String summaryPost;
-
- // TODO: add an option to bias field weights based on sumTTF of the fields
- // (easy enough to "sum the sums" across multiple shards before scoring)
-
- /**
- * Parses the params included in this request, throwing appropriate user level
- * Exceptions for invalid input, and returning a <code>PhrasesContextData</code>
- * suitable for use in this request.
- */
- public static PhrasesContextData parseAndValidateRequest(final SolrQueryRequest req) throws SolrException {
- return new PhrasesContextData(req);
- }
- private PhrasesContextData(final SolrQueryRequest req) throws SolrException {
- final SolrParams params = req.getParams();
-
- this.rawInput = params.get(PHRASE_INPUT, params.get(CommonParams.Q));
- if (null == this.rawInput) {
- throw new SolrException(ErrorCode.BAD_REQUEST, "phrase identification requires a query string or "
- + PHRASE_INPUT + " param override");
- }
-
- { // field weights & analysis field...
-
- SchemaField tmpAnalysisField = null;
- Map<String,Double> tmpWeights = new TreeMap<>();
-
- final String analysisFieldName = params.get(PHRASE_ANALYSIS_FIELD);
- if (null != analysisFieldName) {
- tmpAnalysisField = req.getSchema().getFieldOrNull(analysisFieldName);
- if (null == tmpAnalysisField) {
- throw new SolrException(ErrorCode.BAD_REQUEST,
- PHRASE_ANALYSIS_FIELD + " param specifies a field name that does not exist: " +
- analysisFieldName);
- }
- }
-
- final Map<String,Float> rawFields = SolrPluginUtils.parseFieldBoosts(params.getParams(PHRASE_FIELDS));
- if (rawFields.isEmpty()) {
- throw new SolrException(ErrorCode.BAD_REQUEST,
- PHRASE_FIELDS + " param must specify a (weighted) list of fields " +
- "to evaluate for phrase identification");
- }
-
- for (Map.Entry<String,Float> entry : rawFields.entrySet()) {
- final SchemaField field = req.getSchema().getFieldOrNull(entry.getKey());
- if (null == field) {
- throw new SolrException(ErrorCode.BAD_REQUEST,
- PHRASE_FIELDS + " param contains a field name that does not exist: " +
- entry.getKey());
- }
- if (null == tmpAnalysisField) {
- tmpAnalysisField = field;
- }
- if ( null == analysisFieldName ) {
- if (! field.getType().equals(tmpAnalysisField.getType())) {
- throw new SolrException
- (ErrorCode.BAD_REQUEST,
- "All fields specified in " + PHRASE_FIELDS + " must have the same fieldType, " +
- "or the advanced " + PHRASE_ANALYSIS_FIELD + " option must specify an override");
- }
- }
- // if a weight isn't specified, assume "1.0"
- final double weight = null == entry.getValue() ? 1.0D : entry.getValue();
- if (weight < 0) {
- throw new SolrException(ErrorCode.BAD_REQUEST,
- PHRASE_FIELDS + " param must use non-negative weight value for field " + field.getName());
- }
- tmpWeights.put(entry.getKey(), weight);
- }
- assert null != tmpAnalysisField;
-
- this.analysisField = tmpAnalysisField;
- this.fieldWeights = Collections.unmodifiableMap(tmpWeights);
- }
-
- { // index/query max phrase sizes...
- final FieldType ft = analysisField.getType();
- this.maxIndexedPositionLength = req.getParams().getInt(PHRASE_INDEX_MAXLEN,
- getMaxShingleSize(ft.getIndexAnalyzer()));
- if (this.maxIndexedPositionLength < 0) {
- throw new SolrException(ErrorCode.BAD_REQUEST,
- "Unable to determine max position length of indexed phrases using " +
- "index analyzer for analysis field: " + analysisField.getName() +
- " and no override detected using param: " + PHRASE_INDEX_MAXLEN);
- }
- this.maxQueryPositionLength = req.getParams().getInt(PHRASE_QUERY_MAXLEN,
- getMaxShingleSize(ft.getQueryAnalyzer()));
- if (this.maxQueryPositionLength < 0) {
- throw new SolrException(ErrorCode.BAD_REQUEST,
- "Unable to determine max position length of query phrases using " +
- "query analyzer for analysis field: " + analysisField.getName() +
- " and no override detected using param: " + PHRASE_QUERY_MAXLEN);
- }
- if (this.maxQueryPositionLength < this.maxIndexedPositionLength) {
- throw new SolrException
- (ErrorCode.BAD_REQUEST,
- "Effective value of " + PHRASE_INDEX_MAXLEN + " (either from index analyzer shingle factory, " +
- " or expert param override) must be less then or equal to the effective value of " +
- PHRASE_QUERY_MAXLEN + " (either from query analyzer shingle factory, or expert param override)");
- }
- }
-
- this.summaryPre = params.get(PHRASE_SUMMARY_PRE, "{");
- this.summaryPost = params.get(PHRASE_SUMMARY_POST, "}");
-
- this.allPhrases = Phrase.extractPhrases(this.rawInput, this.analysisField,
- this.maxIndexedPositionLength,
- this.maxQueryPositionLength);
-
- }
-
- /**
- * Given a list of phrases to be returned to the user, summarizes those phrases by decorating the
- * original input string to indicate where the identified phrases exist, using {@link #summaryPre}
- * and {@link #summaryPost}. For example: with the default markers, an input of <code>quick brown fox</code>
- * with the single identified phrase <code>quick brown</code> is summarized as <code>{quick brown} fox</code>.
- *
- * @param results a list of (non overlapping) Phrases that have been identified, sorted from highest scoring to lowest
- * @return the original user input, decorated to indicate the identified phrases
- */
- public String summarize(final List<Phrase> results) {
- final StringBuffer out = new StringBuffer(rawInput);
-
- // sort by *reverse* position so we can go back to front
- final List<Phrase> reversed = results.stream()
- .sorted(Comparator.comparing((p -> p.getPositionStart()), Collections.reverseOrder()))
- .collect(Collectors.toList());
-
- for (Phrase p : reversed) {
- out.insert(p.getOffsetEnd(), summaryPost);
- out.insert(p.getOffsetStart(), summaryPre);
- }
- return out.toString();
- }
- }
-
-
- /**
- * Model the data known about a single (candidate) Phrase -- which may or may not be indexed
- * @lucene.internal
- */
- public static final class Phrase {
-
- /**
- * Factory method for constructing a list of Phrases given the specified input and using the analyzer
- * for the specified field. The <code>maxIndexedPositionLength</code> and
- * <code>maxQueryPositionLength</code> provided *must* match the effective values used by
- * the respective analyzers.
- */
- public static List<Phrase> extractPhrases(final String input, final SchemaField analysisField,
- final int maxIndexedPositionLength,
- final int maxQueryPositionLength) {
-
- // TODO: rather than requiring the query analyzer to produce the Phrases for us (assuming Shingles)
- // we could potentially just require that it produces unigrams compatible with the unigrams in the
- // indexed fields, and then build our own Phrases at query time -- making the maxQueryPositionLength
- // a 100% run time configuration option.
- // But that could be tricky given an arbitrary analyzer -- we'd have to pay careful attention
- // to positions, and we'd have to guess/assume what placeholders/fillers were used in the indexed Phrases
- // (typically shingles)
-
- assert maxIndexedPositionLength <= maxQueryPositionLength;
-
- final CharsRefBuilder buffer = new CharsRefBuilder();
- final FieldType ft = analysisField.getType();
- final Analyzer analyzer = ft.getQueryAnalyzer();
- final List<Phrase> results = new ArrayList<>(42);
- try (TokenStream tokenStream = analyzer.tokenStream(analysisField.getName(), input)) {
-
- final OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
- final PositionIncrementAttribute posIncAttr = tokenStream.addAttribute(PositionIncrementAttribute.class);
- final PositionLengthAttribute posLenAttr = tokenStream.addAttribute(PositionLengthAttribute.class);
- final TermToBytesRefAttribute termAttr = tokenStream.addAttribute(TermToBytesRefAttribute.class);
-
- int position = 0;
- int lastPosLen = -1;
-
- tokenStream.reset();
- while (tokenStream.incrementToken()) {
- final Phrase phrase = new Phrase();
-
- final int posInc = posIncAttr.getPositionIncrement();
- final int posLen = posLenAttr.getPositionLength();
-
- if (0 == posInc && posLen <= lastPosLen) {
- // This requirement of analyzers to return tokens in ascending order of length
- // is currently necessary for the "linking" logic below to work;
- // if people run into real world situations where this is problematic,
- // we can relax this check if we also make the linking logic more complex
- // (ie: less optimized)
- throw new SolrException
- (ErrorCode.BAD_REQUEST, "Phrase identification currently requires that " +
- "the analyzer used must produce tokens that overlap in increasing order of length. ");
- }
-
- position += posInc;
- lastPosLen = posLen;
-
- phrase.position_start = position;
- phrase.position_end = position + posLen;
-
- phrase.is_indexed = (posLen <= maxIndexedPositionLength);
-
- phrase.offset_start = offsetAttr.startOffset();
- phrase.offset_end = offsetAttr.endOffset();
-
- // populate the subsequence directly from the raw input using the offsets,
- // (instead of using the TermToBytesRefAttribute) so we preserve the original
- // casing, whitespace, etc...
- phrase.subSequence = input.subSequence(phrase.offset_start, phrase.offset_end);
-
- if (phrase.is_indexed) {
- // populate the bytes so we can build term queries
- phrase.bytes = BytesRef.deepCopyOf(termAttr.getBytesRef());
- }
-
- results.add(phrase);
- }
- tokenStream.end();
- } catch (IOException e) {
- throw new SolrException(ErrorCode.SERVER_ERROR,
- "Analysis error extracting phrases from: " + input, e);
- }
-
- // fill in the relationships of each phrase
- //
- // NOTE: this logic currently requires that the phrases are sorted by position ascending
- // (automatic because of how PositionIncrementAttribute works) then by length ascending
- // (when positions are tied).
- // We could de-optimize this code if we find that secondary ordering is too restrictive for
- // some analyzers
- //
- // NOTE: changes to the scoring model may allow us to optimize/prune down the relationships tracked,
- // ...OR.... may require us to add/track more details about sub/parent phrases
- //
- for (int p = 0; p < results.size(); p++) {
- final Phrase current = results.get(p);
- if (! current.is_indexed) {
- // we're not an interesting sub phrase of anything
- continue;
- }
-
- // set up links from the phrase to itself if needed
- addLinkages(current, current, maxIndexedPositionLength);
-
- // scan backwards looking for phrases that might include us...
- BEFORE: for (int i = p-1; 0 <= i; i--) {
- final Phrase previous = results.get(i);
- if (previous.position_start < (current.position_end - maxQueryPositionLength)) {
- // we've scanned so far back nothing else is viable
- break BEFORE;
- }
- // any 'previous' phrases must start where current starts or earlier,
- // so only need to check the end...
- if (current.position_end <= previous.position_end) {
- addLinkages(previous, current, maxIndexedPositionLength);
- }
- }
- // scan forwards looking for phrases that might include us...
- AFTER: for (int i = p+1; i < results.size(); i++) {
- final Phrase next = results.get(i);
- // the only way a phrase that comes after current can include current is
- // if they have the same start position...
- if (current.position_start != next.position_start) {
- // we've scanned so far forward nothing else is viable
- break AFTER;
- }
- // any 'next' phrases must start where current starts, so only need to check the end...
- if (current.position_end <= next.position_end) {
- addLinkages(next, current, maxIndexedPositionLength);
- }
- }
- }
-
- return Collections.unmodifiableList(results);
- }
-
- /**
- * Given two phrases, one of which is a superset of the other, adds the necessary linkages
- * needed by the scoring model
- */
- private static void addLinkages(final Phrase outer, final Phrase inner,
- final int maxIndexedPositionLength) {
-
- assert outer.position_start <= inner.position_start;
- assert inner.position_end <= outer.position_end;
- assert inner.is_indexed;
-
- final int inner_len = inner.getPositionLength();
- if (1 == inner_len) {
- outer.individualIndexedTerms.add(inner);
- }
- if (maxIndexedPositionLength == inner_len
- || (inner == outer && inner_len < maxIndexedPositionLength)) {
- outer.largestIndexedSubPhrases.add(inner);
- }
- if (outer.is_indexed && inner != outer) {
- inner.indexedSuperPhrases.add(outer);
- }
- }
-
- /**
- * Format the phrases suitable for returning in a shard response
- * @see #populateStats(List,List)
- */
- public static List<NamedList<Object>> formatShardResponse(final List<Phrase> phrases) {
- List<NamedList<Object>> results = new ArrayList<>(phrases.size());
- for (Phrase p : phrases) {
- NamedList<Object> data = new SimpleOrderedMap<>();
- // quick and dirty way to validate that our shards aren't using different analyzers
- // so the coordinating node can fail fast when merging the results
- data.add("checksum", p.getChecksum());
- if (p.is_indexed) {
- data.add("ttf", new NamedList<Object>(p.phrase_ttf));
- data.add("df", new NamedList<Object>(p.phrase_df));
- }
- data.add("conj_dc", new NamedList<Object>(p.subTerms_conjunctionCounts));
-
- results.add(data);
- }
- return results;
- }
-
- /**
- * Populates the phrases with (merged) stats from a remote shard
- * @see #formatShardResponse
- */
- public static void populateStats(final List<Phrase> phrases, final List<NamedList<Object>> shardData) {
- final int numPhrases = phrases.size();
- if (shardData.size() != numPhrases) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
- "num phrases in shard data not consistent: " +
- numPhrases + " vs " + shardData.size());
- }
- for (int i = 0; i < phrases.size(); i++) {
- // rather than being paranoid about the expected structure, we'll just let the low level
- // code throw an NPE / CCE / AIOOBE / etc. and wrap & rethrow later...
- try {
- final Phrase p = phrases.get(i);
- final NamedList<Object> data = shardData.get(i);
- // sanity check the correct phrase
- if (! p.getChecksum().equals(data.get("checksum"))) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
- "phrase #" + i + " in shard data had invalid checksum");
- }
- if (p.is_indexed) {
- for (Map.Entry<String,Long> ttf : (NamedList<Long>) data.get("ttf")) {
- p.phrase_ttf.merge(ttf.getKey(), ttf.getValue(), Long::sum);
- }
- for (Map.Entry<String,Long> df : (NamedList<Long>) data.get("df")) {
- p.phrase_df.merge(df.getKey(), df.getValue(), Long::sum);
- }
- }
- for (Map.Entry<String,Long> conj_dc : (NamedList<Long>) data.get("conj_dc")) {
- p.subTerms_conjunctionCounts.merge(conj_dc.getKey(), conj_dc.getValue(), Long::sum);
- }
- } catch (RuntimeException e) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
- "shard data for phrase#" + i + " not consistent", e);
- }
- }
- }
-
- /**
- * Populates the phrases with stats from the local index for the specified fields
- */
- public static void populateStats(final List<Phrase> phrases, final Collection<String> fieldNames,
- final SolrIndexSearcher searcher) throws IOException {
- final IndexReader reader = searcher.getIndexReader();
- for (String field : fieldNames) {
- for (Phrase phrase : phrases) {
- if (phrase.is_indexed) {
- // add stats based on this entire phrase as an indexed term
- final Term t = new Term(field, phrase.bytes);
- phrase.phrase_ttf.put(field, reader.totalTermFreq(t));
- phrase.phrase_df.put(field, (long)reader.docFreq(t));
- }
-
- // even if our phrase is too long to be indexed whole, add stats based on the
- // conjunction of all the individual terms in the phrase
- List<Query> filters = new ArrayList<>(phrase.individualIndexedTerms.size());
- for (Phrase term : phrase.individualIndexedTerms) {
- // trust the SolrIndexSearcher to cache & intersect the individual terms so that this
- // can be efficient regardless of how often terms are re-used in the input/phrases
- filters.add(new TermQuery(new Term(field, term.bytes)));
- }
- final long count = searcher.getDocSet(filters).size();
- phrase.subTerms_conjunctionCounts.put(field, count);
- }
- }
- }
-
- /**
- * Uses the previously populated stats to populate each Phrase with its scores for the specified fields,
- * and its overall (weighted) total score. This is not needed on shard requests.
- *
- * @see #populateStats
- * @see #getFieldScore(String)
- * @see #getTotalScore
- */
- public static void populateScores(final PhrasesContextData contextData) {
- populateScores(contextData.allPhrases, contextData.fieldWeights,
- contextData.maxIndexedPositionLength,
- contextData.maxQueryPositionLength);
- }
-
- /**
- * Public for testing purposes
- * @see #populateScores(PhrasesIdentificationComponent.PhrasesContextData)
- * @lucene.internal
- */
- public static void populateScores(final List<Phrase> phrases, final Map<String,Double> fieldWeights,
- final int maxIndexedPositionLength,
- final int maxQueryPositionLength) {
- final double total_weight = fieldWeights.values().stream().mapToDouble(Double::doubleValue).sum();
- for (Phrase phrase : phrases) {
- double phrase_cumulative_score = 0.0D;
- for (Map.Entry<String,Double> entry : fieldWeights.entrySet()) {
- final String field = entry.getKey();
- final double weight = entry.getValue();
- double field_score = computeFieldScore(phrase, field,
- maxIndexedPositionLength, maxQueryPositionLength);
- phrase.fieldScores.put(field,field_score);
- phrase_cumulative_score += (field_score * weight);
- }
- phrase.total_score = (total_weight < 0 ? Double.NEGATIVE_INFINITY
- : (phrase_cumulative_score / total_weight));
- }
- }
-
- private Phrase() {
- // No-Op
- }
-
- private boolean is_indexed;
- private double total_score = -1.0D; // until we get a computed score, this is "not a phrase"
-
- private CharSequence subSequence;
- private BytesRef bytes;
- private int offset_start;
- private int offset_end;
- private int position_start;
- private int position_end;
- private Integer checksum = null;
-
- /** NOTE: Indexed phrases of length 1 are the (sole) individual terms of themselves */
- private final List<Phrase> individualIndexedTerms = new ArrayList<>(7);
- /**
- * NOTE: Indexed phrases of length less than the max indexed length are the (sole)
- * largest sub-phrases of themselves
- */
- private final List<Phrase> largestIndexedSubPhrases = new ArrayList<>(7);
- /** Phrases larger than this phrase which are indexed and fully contain it */
- private final List<Phrase> indexedSuperPhrases = new ArrayList<>(7);
-
- // NOTE: keys are field names
- private final Map<String,Long> subTerms_conjunctionCounts = new TreeMap<>();
- private final Map<String,Long> phrase_ttf = new TreeMap<>();
- private final Map<String,Long> phrase_df = new TreeMap<>();
- private final Map<String,Double> fieldScores = new TreeMap<>();
-
- public String toString() {
- return "'" + subSequence + "'"
- + "[" + offset_start + ":" + offset_end + "]"
- + "[" + position_start + ":" + position_end + "]";
- }
-
- public NamedList getDetails() {
- SimpleOrderedMap<Object> out = new SimpleOrderedMap<Object>();
- out.add("text", subSequence);
- out.add("offset_start", getOffsetStart());
- out.add("offset_end", getOffsetEnd());
- out.add("score", getTotalScore());
- out.add("field_scores", fieldScores);
- return out;
- }
-
- /**
- * Computes & caches the checksum of this Phrase (if not already cached).
- * Needed only when merging shard data, to validate that there are no inconsistencies with the remote shards.
- */
- private Integer getChecksum() {
- if (null == checksum) {
- checksum = Arrays.hashCode(new int[] { offset_start, offset_end, position_start, position_end });
- }
- return checksum;
- }
- /** The characters from the original input that correspond with this Phrase */
- public CharSequence getSubSequence() {
- return subSequence;
- }
-
- /**
- * Returns the list of "individual" (ie: <code>getPositionLength()==1</code>) terms.
- * NOTE: Indexed phrases of length 1 are the (sole) individual terms of themselves
- */
- public List<Phrase> getIndividualIndexedTerms() {
- return individualIndexedTerms;
- }
- /**
- * Returns the list of (overlapping) sub phrases that have the largest possible size based on
- * the effective value of {@link PhrasesContextData#maxIndexedPositionLength}.
- * NOTE: Indexed phrases of length less than the max indexed length are the (sole)
- * largest sub-phrases of themselves.
- */
- public List<Phrase> getLargestIndexedSubPhrases() {
- return largestIndexedSubPhrases;
- }
- /**
- * Returns all phrases larger than this phrase, which fully include this phrase, and are indexed.
- * NOTE: A Phrase is <em>never</em> the super phrase of itself.
- */
- public List<Phrase> getIndexedSuperPhrases() {
- return indexedSuperPhrases;
- }
-
- /** NOTE: positions start at '1' */
- public int getPositionStart() {
- return position_start;
- }
- /** NOTE: positions start at '1' */
- public int getPositionEnd() {
- return position_end;
- }
- public int getPositionLength() {
- return position_end - position_start;
- }
- /** Each set bit identifies a position filled by this Phrase */
- public BitSet getPositionsBitSet() {
- final BitSet result = new BitSet();
- result.set(position_start, position_end);
- return result;
- }
- public int getOffsetStart() {
- return offset_start;
- }
- public int getOffsetEnd() {
- return offset_end;
- }
-
- /**
- * Returns the overall score for this Phrase. In the current implementation,
- * the only guarantee made regarding the range of possible values is that 0 (or less) means
- * it is not a good phrase.
- *
- * @return A numeric value indicating the confidence in this Phrase, higher numbers are higher confidence.
- */
- public double getTotalScore() {
- return total_score;
- }
- /**
- * Returns the score for this Phrase in this given field. In the current implementation,
- * the only guarantee made regarding the range of possible values is that 0 (or less) means
- * it is not a good phrase.
- *
- * @return A numeric value indicating the confidence in this Phrase for this field, higher numbers are higher confidence.
- */
- public double getFieldScore(String field) {
- return fieldScores.getOrDefault(field, -1.0D);
- }
-
- /**
- * Returns the total TTF (total term frequency) of this (indexed) Phrase <em>as a term</em> in the specified field.
- * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats}
- * methods has been called with this field.
- */
- public long getTTF(String field) {
- if (!is_indexed) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
- "TTF is only available for indexed phrases");
- }
- return phrase_ttf.getOrDefault(field, 0L);
- }
- /**
- * Returns the number of documents that contain <em>all</em> of the {@link #getIndividualIndexedTerms}
- * that make up this Phrase, in the specified field.
- * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats}
- * methods has been called with this field.
- */
- public long getConjunctionDocCount(String field) {
- return subTerms_conjunctionCounts.getOrDefault(field, 0L);
- }
- /**
- * Returns the number of documents that contain this (indexed) Phrase <em>as a term</em>
- * in the specified field.
- * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats}
- * methods has been called with this field.
- */
- public long getDocFreq(String field) {
- if (!is_indexed) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
- "DF is only available for indexed phrases");
- }
- return phrase_df.getOrDefault(field, 0L);
- }
-
- /**
- * Uses the previously populated stats to compute a score for the specified field.
- *
- * <p>
- * The current implementation returns scores in the range of <code>[0,1]</code>, but this
- * may change in future implementations. The only current guarantees are:
- * </p>
- *
- * <ul>
- * <li>0 (or less) means this is guaranteed to not be a phrase</li>
- * <li>larger numbers are higher confidence</li>
- * </ul>
- *
- * @see #populateStats
- * @see #populateScores
- * @see #getFieldScore(String)
- * @return a score value
- */
- private static double computeFieldScore(final Phrase input,
- final String field,
- final int maxIndexedPositionLength,
- final int maxQueryPositionLength) {
- final long num_indexed_sub_phrases = input.getLargestIndexedSubPhrases().size();
- assert 0 <= num_indexed_sub_phrases; // should be impossible
-
- if (input.getIndividualIndexedTerms().size() < input.getPositionLength()) {
- // there are "gaps" in our input, where individual words have not been indexed (stop words,
- // or multivalue position gap) which means we are not a viable candidate for being a valid Phrase.
- return -1.0D;
- }
-
- final long phrase_conj_count = input.getConjunctionDocCount(field);
- // if there isn't a single document containing all the terms in our
- // phrase, then it is 100% not a phrase
- if (phrase_conj_count <= 0) {
- return -1.0D;
- }
-
- // single words automatically score 0.0 (unless they already scored less for not existing)
- if (input.getPositionLength() <= 1) {
- return 0.0D;
- }
-
- double field_score = 0.0D;
- long max_sub_conj_count = phrase_conj_count;
-
- // At the moment, the contribution of each "words" sub-Phrase to the field score of the input
- // Phrase is independent of any context of "input". Depending on if/how sub-phrase scoring
- // changes, we might consider computing the scores of all the indexed phrases first, and
- // caching the portions of their values that are re-used when computing the scores of
- // longer phrases?
- //
- // This would make the overall scoring of all phrases a lot more complicated,
- // but could save CPU cycles?
- // (particularly when maxIndexedPositionLength <<< maxQueryPositionLength ???)
- //
- // My gut says that knowing the conj_count(input) "context" should help us score the
- // sub-phrases better, but I can't yet put my finger on why/how. Maybe by comparing
- // the conj_count(input) to the max(conj_count(parent of words)) ?
-
- // for each of the longest indexed phrases, aka indexed sub-sequence of "words", we have...
- for (Phrase words : input.getLargestIndexedSubPhrases()) {
- // we're going to compute scores in range of [-1:1] to indicate the likelihood that our
- // "words" should be used as a "phrase", based on a bayesian document categorization model,
- // where the "words as a phrase" (aka: phrase) is our candidate category.
- //
- // P(words|phrase) * P(phrase) - P(words|not phrase) * P(not phrase)
- //
- // Where...
- // P(words|phrase) = phrase_ttf / min(word_ttf)
- // P(phrase) =~ phrase_docFreq / conj_count(words in phrase) *SEE NOTE BELOW*
- // P(words|not phrase) = phrase_ttf / max(word_ttf)
- // P(not a phrase) = 1 - P(phrase)
- //
- // ... BUT! ...
- //
- // NOTE: we're going to reduce our "P(phrase) by the max "P(phrase)" of all the (indexed)
- // candidate phrases we are a sub-phrase of, to try to offset the inherent bias in favor
- // of small indexed phrases -- because anytime the super-phrase exists, the sub-phrase exists
-
-
- // IDEA: consider replacing this entire Bayesian model with LLR (or rootLLR)...
- // http://mahout.apache.org/docs/0.13.0/api/docs/mahout-math/org/apache/mahout/math/stats/LogLikelihood.html
- // ...where we compute LLR over each of the TTF of the pairs of adjacent sub-phrases of each
- // indexed phrase and take the min|max|avg of the LLR scores.
- //
- // ie: for indexed shingle "quick brown fox" compute LLR(ttf("quick"), ttf("brown fox")) &
- // LLR(ttf("quick brown"), ttf("fox")) using ttf("quick brown fox") as the co-occurrence
- // count, and sumTTF-ttf("quick")-ttf("brown")-ttf("fox") as the "something else"
- //
- // (we could actually compute LLR stats over TTF and DF and combine them)
- //
- // NOTE: Going the LLR/rootLLR route would require building a full "tree" of every (indexed)
- // sub-phrase of every other phrase (or at least: all siblings of diff sizes that add up to
- // an existing phrase). As well as require us to give up on a predictable "range" of
- // legal values for scores (IIUC from the LLR docs)
-
- final long phrase_ttf = words.getTTF(field);
- final long phrase_df = words.getDocFreq(field);
- final long words_conj_count = words.getConjunctionDocCount(field);
- max_sub_conj_count = Math.max(words_conj_count, max_sub_conj_count);
-
- final double max_wrapper_phrase_probability =
- words.getIndexedSuperPhrases().stream()
- .mapToDouble(p -> p.getConjunctionDocCount(field) <= 0 ?
- // special case check -- we already know *our* conj count > 0,
- // but we need a similar check for wrapper phrases: if <= 0, their probability is 0
- 0.0D : ((double)p.getDocFreq(field) / p.getConjunctionDocCount(field))).max().orElse(0.0D);
-
- final LongSummaryStatistics words_ttfs =
- words.getIndividualIndexedTerms().stream()
- .collect(Collectors.summarizingLong(t -> t.getTTF(field)));
-
- final double words_phrase_prob = (phrase_ttf / (double)words_ttfs.getMin());
- final double words_not_phrase_prob = (phrase_ttf / (double)words_ttfs.getMax());
-
- final double phrase_prob = (phrase_conj_count / (double)words_conj_count);
-
-
- final double phrase_score = words_phrase_prob * (phrase_prob - max_wrapper_phrase_probability);
- final double not_phrase_score = words_not_phrase_prob * (1 - (phrase_prob - max_wrapper_phrase_probability));
- final double words_score = phrase_score - not_phrase_score;
-
- field_score += words_score;
- }
-
- // NOTE: the "scaling" factors below can "increase" negative scores (by reducing the unsigned value)
- // when they should ideally be penalizing the scores further, but since we currently don't care
- // about any score lower than 0, it's not worth worrying about.
-
- // Average the accumulated score over the number of actual indexed sub-phrases that contributed
- //
- // NOTE: since we subsequently want to multiply the score by a fraction with num_indexed_sub_phrases
- // in the numerator, we can skip this...
- // SEE BELOW // field_score /= (double) num_indexed_sub_phrases;
-
- // If we leave field_score as is, then a phrase longer than the maxIndexedPositionLength
- // will never score higher than the highest scoring sub-phrase it has (because we've averaged them)
- // so we scale the scores against the longest possible phrase length we're considering
- //
- // NOTE: We don't use num_indexed_sub_phrases in the numerator since we skipped it when
- // averaging above...
- field_score *= ( 1.0D // SEE ABOVE // * ( (double)num_indexed_sub_phrases )
- / (1 + maxQueryPositionLength - maxIndexedPositionLength) );
-
- // scale the field_score based on the ratio of the conjunction docCount for the whole phrase
- // relative to the largest conjunction docCount of its (largest indexed) sub phrases, to penalize
- // the scores of very long phrases that exist very rarely relative to how often their
- // sub phrases exist in the index
- field_score *= ( ((double) phrase_conj_count) / max_sub_conj_count);
-
- return field_score;
- }
- }
-
- /**
- * Helper method, public for testing purposes only.
- * <p>
- * Given an analyzer, inspects it to determine if:
- * <ul>
- * <li>it is a {@link TokenizerChain}</li>
- * <li>it contains exactly one instance of {@link ShingleFilterFactory}</li>
- * </ul>
- * <p>
- * If these conditions are met, then this method returns the <code>maxShingleSize</code>
- * in effect for this analyzer, otherwise returns -1.
- * </p>
- *
- * @param analyzer An analyzer to inspect
- * @return <code>maxShingleSize</code> if available
- * @lucene.internal
- */
- public static int getMaxShingleSize(Analyzer analyzer) {
- if (!TokenizerChain.class.isInstance(analyzer)) {
- return -1;
- }
-
- final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories();
- if (0 == factories.length) {
- return -1;
- }
- int result = -1;
- for (TokenFilterFactory tff : factories) {
- if (ShingleFilterFactory.class.isInstance(tff)) {
- if (0 < result) {
- // more than one shingle factory in our analyzer, which is weird, so make no assumptions...
- return -1;
- }
- // would be nice if there was an easy way to just ask a factory for the effective value
- // of an argument...
- final Map<String,String> args = tff.getOriginalArgs();
- result = args.containsKey("maxShingleSize")
- ? Integer.parseInt(args.get("maxShingleSize")) : ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
- }
- }
- return result;
- }
-}
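
The component deleted above is driven entirely by request params ("phrases", "phrases.fields", "phrases.q", etc., per the constants near the top of the class) and reports its results under a "phrases" response section containing "input", "summary", and "details". As a rough illustration, a SolrJ client might have exercised it along these lines -- a minimal sketch, assuming the component was registered in solrconfig.xml and added to a SearchHandler's component list; the base URL, collection, and the "title_phrases"/"body_phrases" field names are hypothetical, with those fields presumed to use the shingle-based fieldType shown in the class javadoc:

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.util.NamedList;

public class PhrasesExample {
  public static void main(String[] args) throws Exception {
    // hypothetical base URL and collection
    try (HttpSolrClient client =
             new HttpSolrClient.Builder("http://localhost:8983/solr/techproducts").build()) {
      SolrQuery query = new SolrQuery("star wars action figure");
      query.set("phrases", true);                                   // enable the component
      query.set("phrases.fields", "title_phrases^2 body_phrases");  // weighted field list (hypothetical fields)
      // query.set("phrases.q", "...");                             // optional override of the q input

      QueryResponse rsp = client.query(query);
      // the component adds a "phrases" section with "input", "summary", and "details"
      NamedList<?> phrases = (NamedList<?>) rsp.getResponse().get("phrases");
      System.out.println("summary: " + phrases.get("summary"));
    }
  }
}

With the default "{" / "}" markers, the "summary" value is the original input with each identified phrase wrapped in braces, and "details" lists each phrase with its offsets, total score, and per-field scores.
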
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0ae21ad0/solr/core/src/java/org/apache/solr/handler/component/PivotFacet.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/component/PivotFacet.java b/solr/core/src/java/org/apache/solr/handler/component/PivotFacet.java
deleted file mode 100644
index 37a522e..0000000
--- a/solr/core/src/java/org/apache/solr/handler/component/PivotFacet.java
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.handler.component;
-
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.solr.common.params.FacetParams;
-import org.apache.solr.common.util.NamedList;
-import org.apache.solr.handler.component.FacetComponent.FacetBase;
-
-/**
- * Models a single instance of a "pivot" specified by a {@link FacetParams#FACET_PIVOT}
- * param, which may contain multiple nested fields.
- *
- * This class is also used to coordinate the refinement requests needed from various
- * shards when processing a distributed request
- */
-public class PivotFacet extends FacetBase {
-
- /**
- * Local param used to indicate that refinements are required on a pivot. Should
- * also be used as the prefix for concatenating with the value to determine the
- * name of the multi-valued param that will contain all of the values needed for
- * refinement.
- */
- public static final String REFINE_PARAM = "fpt";
-
- // TODO: is this really needed? can't we just loop over 0<=i<rb.shards.length ?
- public final BitSet knownShards = new BitSet();
-
- private final Map<Integer, List<PivotFacetValue>> queuedRefinements = new HashMap<>();
-
- // if null, then either we haven't collected any responses from shards
- // or all the shards that have responded so far haven't had any values for the top
- // field of this pivot. May be null forever if no doc in any shard has a value
- // for the top field of the pivot
- private PivotFacetField pivotFacetField;
-
- public PivotFacet(ResponseBuilder rb, String facetStr) {
- super(rb, FacetParams.FACET_PIVOT, facetStr);
- }
-
- /**
- * Tracks that the specified shard needs to be asked to refine the specified
- * {@link PivotFacetValue}
- *
- * @see #getQueuedRefinements
- */
- public void addRefinement(int shardNumber, PivotFacetValue value) {
-
- if (!queuedRefinements.containsKey(shardNumber)) {
- queuedRefinements.put(shardNumber, new ArrayList<PivotFacetValue>());
- }
-
- queuedRefinements.get(shardNumber).add(value);
- }
-
- /**
- * An immutable List of the {@link PivotFacetValue}s that need to be
- * refined for this pivot. Once these refinements have been processed,
- * the caller should clear them using {@link #removeAllRefinementsForShard}
- *
- * @see #addRefinement
- * @see #removeAllRefinementsForShard
- * @return a list of the values to refine, or an empty list.
- */
- public List<PivotFacetValue> getQueuedRefinements(int shardNumber) {
- List<PivotFacetValue> raw = queuedRefinements.get(shardNumber);
- if (null == raw) {
- raw = Collections.<PivotFacetValue>emptyList();
- }
- return Collections.unmodifiableList(raw);
- }
-
- /**
- * Clears the list of queued refinements for the specified shard
- *
- * @see #addRefinement
- * @see #getQueuedRefinements
- */
- public void removeAllRefinementsForShard(int shardNumber) {
- queuedRefinements.remove(shardNumber);
- }
-
- /**
- * If true, then additional refinement requests are needed to flesh out the correct
- * counts for this Pivot
- *
- * @see #getQueuedRefinements
- */
- public boolean isRefinementsRequired() {
- return ! queuedRefinements.isEmpty();
- }
-
- /**
- * A recursive method for generating <code>NamedLists</code> for this pivot
- * suitable for including in a pivot facet response to the original distributed request.
- *
- * @see PivotFacetField#trim
- * @see PivotFacetField#convertToListOfNamedLists
- */
- public List<NamedList<Object>> getTrimmedPivotsAsListOfNamedLists() {
- if (null == pivotFacetField) {
- // no values in any shard for the top field of this pivot
- return Collections.<NamedList<Object>>emptyList();
- }
-
- pivotFacetField.trim();
- return pivotFacetField.convertToListOfNamedLists();
- }
-
- /**
- * A recursive method for determining which {@link PivotFacetValue}s need to be
- * refined for this pivot.
- *
- * @see PivotFacetField#queuePivotRefinementRequests
- */
- public void queuePivotRefinementRequests() {
- if (null == pivotFacetField) return; // NOOP
-
- pivotFacetField.sort();
- pivotFacetField.queuePivotRefinementRequests(this);
- }
-
- /**
- * Recursively merges the response from the specified shard, tracking the known shards.
- *
- * @see PivotFacetField#contributeFromShard
- * @see PivotFacetField#createFromListOfNamedLists
- */
- public void mergeResponseFromShard(int shardNumber, ResponseBuilder rb, List<NamedList<Object>> response) {
-
- knownShards.set(shardNumber);
- if (pivotFacetField == null) {
- pivotFacetField = PivotFacetField.createFromListOfNamedLists(shardNumber, rb, null, response);
- } else {
- pivotFacetField.contributeFromShard(shardNumber, rb, response);
- }
- }
-
- public String toString() {
- return "[" + facetStr + "] | " + this.getKey();
- }
-}
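
For context, the PivotFacet class deleted above is purely server-side bookkeeping for distributed facet.pivot requests; the client-facing API it supports is an ordinary pivot facet request. A minimal SolrJ sketch of the kind of request that exercises it, where the base URL, collection, and the "cat"/"manu" field names are hypothetical:

import java.util.List;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.PivotField;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.util.NamedList;

public class PivotExample {
  public static void main(String[] args) throws Exception {
    try (HttpSolrClient client =
             new HttpSolrClient.Builder("http://localhost:8983/solr/techproducts").build()) {
      SolrQuery query = new SolrQuery("*:*");
      query.setRows(0);
      query.setFacet(true);
      query.addFacetPivotField("cat,manu");  // comma-separated list of nested fields

      QueryResponse rsp = client.query(query);
      NamedList<List<PivotField>> pivots = rsp.getFacetPivot();
      for (PivotField top : pivots.get("cat,manu")) {
        System.out.println(top.getValue() + " (" + top.getCount() + ")");
        if (top.getPivot() != null) {
          for (PivotField sub : top.getPivot()) {  // nested level, one per child field
            System.out.println("  " + sub.getValue() + " (" + sub.getCount() + ")");
          }
        }
      }
    }
  }
}

Each level of nesting in the response corresponds to one level of the hierarchy described by the PivotFacetField class whose diff follows.
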
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0ae21ad0/solr/core/src/java/org/apache/solr/handler/component/PivotFacetField.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/component/PivotFacetField.java b/solr/core/src/java/org/apache/solr/handler/component/PivotFacetField.java
deleted file mode 100644
index 9b73841..0000000
--- a/solr/core/src/java/org/apache/solr/handler/component/PivotFacetField.java
+++ /dev/null
@@ -1,397 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.handler.component;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-
-import org.apache.solr.common.params.FacetParams;
-import org.apache.solr.common.params.SolrParams;
-import org.apache.solr.common.util.NamedList;
-
-
-/**
- * Models a single field somewhere in a hierarchy of fields as part of a pivot facet.
- * This pivot field contains {@link PivotFacetValue}s which may each contain a nested
- * {@link PivotFacetField} child. This <code>PivotFacetField</code> may itself
- * be a child of a {@link PivotFacetValue} parent.
- *
- * @see PivotFacetValue
- * @see PivotFacetFieldValueCollection
- */
-@SuppressWarnings("rawtypes")
-public class PivotFacetField {
-
- public final String field;
-
- // null if this is a top level pivot,
- // otherwise the value of the parent pivot we are nested under
- public final PivotFacetValue parentValue;
-
- public final PivotFacetFieldValueCollection valueCollection;
-
- // Facet parameters relating to this field
- private final int facetFieldLimit;
- private final int facetFieldMinimumCount;
- private final int facetFieldOffset;
- private final String facetFieldSort;
-
- private final Map<Integer, Integer> numberOfValuesContributedByShard = new HashMap<>();
- private final Map<Integer, Integer> shardLowestCount = new HashMap<>();
-
- private boolean needRefinementAtThisLevel = true;
-
- private PivotFacetField(ResponseBuilder rb, PivotFacetValue parent, String fieldName) {
-
- field = fieldName;
- parentValue = parent;
-
- // facet params
- SolrParams parameters = rb.req.getParams();
- facetFieldMinimumCount = parameters.getFieldInt(field, FacetParams.FACET_PIVOT_MINCOUNT, 1);
- facetFieldOffset = parameters.getFieldInt(field, FacetParams.FACET_OFFSET, 0);
- facetFieldLimit = parameters.getFieldInt(field, FacetParams.FACET_LIMIT, 100);
- String defaultSort = (facetFieldLimit > 0) ? FacetParams.FACET_SORT_COUNT : FacetParams.FACET_SORT_INDEX;
- facetFieldSort = parameters.getFieldParam(field, FacetParams.FACET_SORT, defaultSort);
-
- valueCollection = new PivotFacetFieldValueCollection(facetFieldMinimumCount, facetFieldOffset, facetFieldLimit, facetFieldSort);
-
- if ( (facetFieldLimit < 0) ||
- // TODO: possible refinement issue if limit=0 & mincount=0 & missing=true
- // (ie: we only want the missing count for this field)
- (facetFieldLimit <= 0 && facetFieldMinimumCount == 0) ||
- (facetFieldSort.equals(FacetParams.FACET_SORT_INDEX) && facetFieldMinimumCount <= 0)
- ) {
- // in any of these cases, there's no need to refine this level of the pivot
- needRefinementAtThisLevel = false;
- }
- }
-
- /**
- * A recursive method that walks up the tree of pivot fields/values to build
- * a list of String representations of the values that lead down to this
- * PivotFacetField.
- *
- * @return A mutable List of the pivot values leading down to this pivot field;
- * never null, but it may contain nulls and may be empty if this is a top
- * level pivot field
- * @see PivotFacetValue#getValuePath
- */
- public List<String> getValuePath() {
- if (null != parentValue) {
- return parentValue.getValuePath();
- }
- return new ArrayList<String>(3);
- }
-
- /**
- * A recursive method to construct a new <code>PivotFacetField</code> object from
- * the contents of the {@link NamedList}s provided by the specified shard, relative
- * to a parent value (if this is not the top field in the pivot hierarchy)
- *
- * The associated child {@link PivotFacetValue}s will be recursively built as well.
- *
- * @see PivotFacetValue#createFromNamedList
- * @param shardNumber the id of the shard that provided this data
- * @param rb The response builder of the current request
- * @param owner the parent value in the current pivot (may be null)
- * @param pivotValues the data from the specified shard for this pivot field, may be null or empty
- * @return the new PivotFacetField, null if pivotValues is null or empty.
- */
- public static PivotFacetField createFromListOfNamedLists(int shardNumber, ResponseBuilder rb, PivotFacetValue owner, List<NamedList<Object>> pivotValues) {
-
- if (null == pivotValues || pivotValues.size() <= 0) return null;
-
- NamedList<Object> firstValue = pivotValues.get(0);
- PivotFacetField createdPivotFacetField
- = new PivotFacetField(rb, owner, PivotFacetHelper.getField(firstValue));
-
- int lowestCount = Integer.MAX_VALUE;
-
- for (NamedList<Object> pivotValue : pivotValues) {
-
- lowestCount = Math.min(lowestCount, PivotFacetHelper.getCount(pivotValue));
-
- PivotFacetValue newValue = PivotFacetValue.createFromNamedList
- (shardNumber, rb, createdPivotFacetField, pivotValue);
- createdPivotFacetField.valueCollection.add(newValue);
- }
-
- createdPivotFacetField.shardLowestCount.put(shardNumber, lowestCount);
- createdPivotFacetField.numberOfValuesContributedByShard.put(shardNumber, pivotValues.size());
-
- return createdPivotFacetField;
- }
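
The two per-shard statistics recorded above drive all later refinement decisions: a shard's lowest returned count bounds how much that shard could still add to any value it omitted, and the number of values it returned reveals whether it hit its facet limit. A standalone sketch with hypothetical numbers (class and variable names are illustrative, not part of Solr):

    import java.util.HashMap;
    import java.util.Map;

    public class ShardStatsSketch {
      public static void main(String[] args) {
        Map<Integer, Integer> shardLowestCount = new HashMap<>();
        Map<Integer, Integer> numValuesByShard = new HashMap<>();

        int[] countsFromShard3 = {9, 7, 4}; // shard 3's response under facet.limit=3
        int lowest = Integer.MAX_VALUE;
        for (int c : countsFromShard3) {
          lowest = Math.min(lowest, c);
        }
        // 4: caps what shard 3 could add to any value it did not report
        shardLowestCount.put(3, lowest);
        // 3 == limit: shard 3 may have truncated its list
        numValuesByShard.put(3, countsFromShard3.length);
        System.out.println(shardLowestCount + " " + numValuesByShard);
      }
    }
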
-
- /**
- * Destructive method that recursively prunes values from the data structure
- * based on the counts for those values and the effective sort, mincount, limit,
- * and offset being used for each field.
- * <p>
- * This method should only be called after all refinement is completed, just prior
- * to calling {@link #convertToListOfNamedLists}
- * </p>
- *
- * @see PivotFacet#getTrimmedPivotsAsListOfNamedLists
- * @see PivotFacetFieldValueCollection#trim
- */
- public void trim() {
- // SOLR-6331...
- //
- // we can probably optimize the memory usage by trimming each level of the pivot once
- // we know we've fully refined the values at that level
- // (ie: fold this logic into refineNextLevelOfFacets)
- this.valueCollection.trim();
- }
-
- /**
- * Recursively sorts the collection of values associated with this field, and
- * any sub-pivots those values have.
- *
- * @see FacetParams#FACET_SORT
- * @see PivotFacetFieldValueCollection#sort
- */
- public void sort() {
- this.valueCollection.sort();
- }
-
- /**
- * A recursive method for generating <code>NamedLists</code> from this field
- * suitable for including in a pivot facet response to the original distributed request.
- */
- public List<NamedList<Object>> convertToListOfNamedLists() {
-
- List<NamedList<Object>> convertedPivotList = null;
-
- if (valueCollection.size() > 0) {
- convertedPivotList = new LinkedList<>();
- for (PivotFacetValue pivot : valueCollection)
- convertedPivotList.add(pivot.convertToNamedList());
- }
-
- return convertedPivotList;
- }
-
- /**
- * A recursive method for determining which {@link PivotFacetValue}s need to be
- * refined for this pivot.
- *
- * @see PivotFacet#queuePivotRefinementRequests
- */
- public void queuePivotRefinementRequests(PivotFacet pf) {
-
- if (needRefinementAtThisLevel) {
-
- if (0 < facetFieldMinimumCount) {
- // missing is always a candidate for refinement if at least one shard met the minimum
- PivotFacetValue missing = valueCollection.getMissingValue();
- if (null != missing) {
- processDefiniteCandidateElement(pf, missing);
- }
- }
-
- if (! valueCollection.getExplicitValuesList().isEmpty()) {
-
- if (FacetParams.FACET_SORT_COUNT.equals(facetFieldSort)) {
- // we only need to refine things that are currently in our limit,
- // or might be in our limit if we get increased counts from shards that
- // didn't include this value the first time
- final int indexOfCountThreshold
- = Math.min(valueCollection.getExplicitValuesListSize(),
- facetFieldOffset + facetFieldLimit) - 1;
- final int countThreshold = valueCollection.getAt(indexOfCountThreshold).getCount();
-
- int positionInResults = 0;
-
- for (PivotFacetValue value : valueCollection.getExplicitValuesList()) {
- if (positionInResults <= indexOfCountThreshold) {
- // This element is within the top results, so we need to get information
- // from all of the shards.
- processDefiniteCandidateElement(pf, value);
- } else {
- // This element is not within the top results, but may still need to be refined.
- processPossibleCandidateElement(pf, value, countThreshold);
- }
-
- positionInResults++;
- }
- } else { // FACET_SORT_INDEX
- // everything needs refinement to see what the per-shard mincount excluded
- for (PivotFacetValue value : valueCollection.getExplicitValuesList()) {
- processDefiniteCandidateElement(pf, value);
- }
- }
- }
-
- needRefinementAtThisLevel = false;
- }
-
- if ( pf.isRefinementsRequired() ) {
- // if any refinements are needed, then we need to stop and wait to
- // see how the picture may change before drilling down to child pivot fields
- return;
- } else {
- // Since all outstanding requests have been filled, we can drill down
- // to the next deeper level and check it.
- refineNextLevelOfFacets(pf);
- }
- }
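
To make the count-sorted branch above concrete: the threshold is the count of the last value that could still appear in the final page. A standalone sketch with hypothetical numbers (names are illustrative only):

    public class ThresholdSketch {
      public static void main(String[] args) {
        int[] sortedCounts = {50, 40, 22, 9, 8, 7, 3, 1}; // merged counts, descending
        int offset = 0, limit = 5;
        // index of the last value inside offset+limit, clamped to the list size
        int thresholdIdx = Math.min(sortedCounts.length, offset + limit) - 1; // 4
        int threshold = sortedCounts[thresholdIdx];                           // 8
        for (int i = 0; i < sortedCounts.length; i++) {
          // at or above the threshold index: definite candidate, ask every shard;
          // below it: only a possible candidate, subject to the upper-bound check
          System.out.println(sortedCounts[i]
              + (i <= thresholdIdx ? " -> definite" : " -> possible (vs threshold " + threshold + ")"));
        }
      }
    }
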
-
- /**
- * Adds refinement requests for the value for each shard that has not already contributed
- * a count for this value.
- */
- private void processDefiniteCandidateElement(PivotFacet pf, PivotFacetValue value) {
-
- for (int shard = pf.knownShards.nextSetBit(0);
- 0 <= shard;
- shard = pf.knownShards.nextSetBit(shard+1)) {
- if ( ! value.shardHasContributed(shard) ) {
- if ( // if we're doing index order, we need to refine anything
- // (mincount may have excluded from a shard)
- FacetParams.FACET_SORT_INDEX.equals(facetFieldSort)
- || (// 'missing' value isn't affected by limit, needs refinement if shard didn't provide it
- null == value.getValue() ||
- // if we are doing count order, we need to refine if the limit was hit
- // (if not, the shard doesn't have the value or it would have returned already)
- numberOfValuesContributedByShardWasLimitedByFacetFieldLimit(shard))) {
- pf.addRefinement(shard, value);
- }
- }
- }
- }
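
Spelled out, the predicate above asks a shard for a value only when count sorting cannot prove the shard lacks it. A hypothetical sketch of the same decision (names are illustrative, not Solr APIs):

    public class RefinePredicateSketch {
      static boolean needsRefinement(boolean shardContributed, boolean sortByIndex,
                                     boolean isMissingBucket, boolean shardHitItsLimit) {
        if (shardContributed) return false; // we already have this shard's count
        return sortByIndex          // index order: mincount may have hidden the value
            || isMissingBucket      // "missing" is never capped by facet.limit
            || shardHitItsLimit;    // count order: value may sit past the shard's limit
      }
      public static void main(String[] args) {
        // Under count sort, a shard that returned fewer values than its limit
        // provably has no more occurrences of this value: no refinement needed.
        System.out.println(needsRefinement(false, false, false, false)); // false
        System.out.println(needsRefinement(false, false, false, true));  // true
      }
    }
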
-
- private boolean numberOfValuesContributedByShardWasLimitedByFacetFieldLimit(int shardNumber) {
- return facetFieldLimit <= numberOfValuesContributedByShard(shardNumber);
- }
-
- private int numberOfValuesContributedByShard(final int shardNumber) {
- return numberOfValuesContributedByShard.containsKey(shardNumber)
- ? numberOfValuesContributedByShard.get(shardNumber)
- : 0;
- }
-
- /**
- * Checks the {@link #lowestCountContributedbyShard} for each shard, combined with the
- * counts we already know, to see if this value is a viable candidate --
- * <b>Does not make sense when using {@link FacetParams#FACET_SORT_INDEX}</b>
- *
- * @see #processDefiniteCandidateElement
- */
- private void processPossibleCandidateElement(PivotFacet pf, PivotFacetValue value,
- final int refinementThreshold) {
-
- assert FacetParams.FACET_SORT_COUNT.equals(facetFieldSort)
- : "Method only makes sense when sorting by count";
-
- int maxPossibleCountAfterRefinement = value.getCount();
-
- for (int shard = pf.knownShards.nextSetBit(0);
- 0 <= shard;
- shard = pf.knownShards.nextSetBit(shard+1)) {
- if ( ! value.shardHasContributed(shard) ) {
- maxPossibleCountAfterRefinement += lowestCountContributedbyShard(shard);
- }
- }
-
- if (refinementThreshold <= maxPossibleCountAfterRefinement) {
- processDefiniteCandidateElement(pf, value);
- }
- }
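
The upper bound computed above can be illustrated with made-up numbers: a value currently counted at 8, missing from shards whose lowest returned counts were 7 and 2, could reach at most 8 + 7 + 2 = 17 after refinement, so it survives a threshold of 15. A standalone sketch (hypothetical data):

    import java.util.Map;

    public class UpperBoundSketch {
      public static void main(String[] args) {
        int currentCount = 8;
        // shards that did NOT report this value, mapped to their lowest returned count
        Map<Integer, Integer> lowestCountByShard = Map.of(1, 7, 4, 2);
        int maxPossible = currentCount;
        for (int lowest : lowestCountByShard.values()) {
          maxPossible += lowest; // each absent shard can add at most its lowest count
        }
        int threshold = 15;
        System.out.println(maxPossible + " >= " + threshold + " ? "
            + (threshold <= maxPossible)); // 17 >= 15 ? true
      }
    }
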
-
- private int lowestCountContributedbyShard(int shardNumber) {
- return (shardLowestCount.containsKey(shardNumber))
- ? shardLowestCount.get(shardNumber)
- : 0;
- }
-
- private void refineNextLevelOfFacets(PivotFacet pf) {
-
- List<PivotFacetValue> explicitValsToRefine
- = valueCollection.getNextLevelValuesToRefine();
-
- for (PivotFacetValue value : explicitValsToRefine) {
- if (null != value.getChildPivot()) {
- value.getChildPivot().queuePivotRefinementRequests(pf);
- }
- }
-
- PivotFacetValue missing = this.valueCollection.getMissingValue();
- if(null != missing && null != missing.getChildPivot()) {
- missing.getChildPivot().queuePivotRefinementRequests(pf);
- }
- }
-
- private void incrementShardValueCount(int shardNumber) {
- if (!numberOfValuesContributedByShard.containsKey(shardNumber)) {
- numberOfValuesContributedByShard.put(shardNumber, 1);
- } else {
- numberOfValuesContributedByShard.put(shardNumber, numberOfValuesContributedByShard.get(shardNumber)+1);
- }
- }
-
- private void contributeValueFromShard(int shardNumber, ResponseBuilder rb, NamedList<Object> shardValue) {
-
- incrementShardValueCount(shardNumber);
-
- Comparable value = PivotFacetHelper.getValue(shardValue);
- int count = PivotFacetHelper.getCount(shardValue);
-
- // We're changing values so we must mark the collection as dirty
- valueCollection.markDirty();
-
- if ( ( !shardLowestCount.containsKey(shardNumber) )
- || shardLowestCount.get(shardNumber) > count) {
- shardLowestCount.put(shardNumber, count);
- }
-
- PivotFacetValue facetValue = valueCollection.get(value);
- if (null == facetValue) {
- // never seen before, we need to create it from scratch
- facetValue = PivotFacetValue.createFromNamedList(shardNumber, rb, this, shardValue);
- this.valueCollection.add(facetValue);
- } else {
- facetValue.mergeContributionFromShard(shardNumber, rb, shardValue);
- }
- }
-
- /**
- * Recursively merges the contributions from the specified shard for each
- * {@link PivotFacetValue} represented in the <code>response</code>.
- *
- * @see PivotFacetValue#mergeContributionFromShard
- * @param shardNumber the id of the shard that provided this data
- * @param rb The response builder of the current request
- * @param response the data from the specified shard for this pivot field, may be null
- */
- public void contributeFromShard(int shardNumber, ResponseBuilder rb, List<NamedList<Object>> response) {
- if (null == response) return;
-
- for (NamedList<Object> responseValue : response) {
- contributeValueFromShard(shardNumber, rb, responseValue);
- }
- }
-
- public String toString(){
- return String.format(Locale.ROOT, "P:%s F:%s V:%s",
- parentValue, field, valueCollection);
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0ae21ad0/solr/core/src/java/org/apache/solr/handler/component/PivotFacetFieldValueCollection.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/component/PivotFacetFieldValueCollection.java b/solr/core/src/java/org/apache/solr/handler/component/PivotFacetFieldValueCollection.java
deleted file mode 100644
index 5c2b07f..0000000
--- a/solr/core/src/java/org/apache/solr/handler/component/PivotFacetFieldValueCollection.java
+++ /dev/null
@@ -1,341 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.handler.component;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-
-import org.apache.solr.common.params.FacetParams;
-
-/**
- * Encapsulates a collection of {@link PivotFacetValue}s associated with a
- * {@link PivotFacetField} with special tracking of a {@link PivotFacetValue}
- * corresponding to the <code>null</code> value when {@link FacetParams#FACET_MISSING}
- * is used.
- *
- * @see #markDirty
- * @see PivotFacetField
- * @see PivotFacetValue
- */
-@SuppressWarnings("rawtypes")
-public class PivotFacetFieldValueCollection implements Iterable<PivotFacetValue> {
- private List<PivotFacetValue> explicitValues;
- private PivotFacetValue missingValue;
- private Map<Comparable, PivotFacetValue> valuesMap;
- private boolean dirty = true;
-
- //Facet parameters relating to this field
- private final int facetFieldMinimumCount;
- private final int facetFieldOffset;
- private final int facetFieldLimit;
- private final String facetFieldSort;
-
-
- public PivotFacetFieldValueCollection(int minCount, int offset, int limit, String fieldSort){
- this.explicitValues = new ArrayList<>();
- this.valuesMap = new HashMap<>();
- this.facetFieldMinimumCount = minCount;
- this.facetFieldOffset = offset;
- this.facetFieldLimit = limit;
- this.facetFieldSort = fieldSort;
- }
-
- /**
- * Indicates that the values in this collection have been modified by the caller.
- *
- * Any caller that manipulates the {@link PivotFacetValue}s contained in this collection
- * must call this method after doing so.
- */
- public void markDirty() {
- dirty = true;
- }
-
- /**
- * The {@link PivotFacetValue} corresponding to a value of
- * <code>null</code> when {@link FacetParams#FACET_MISSING} is used.
- *
- * @return the appropriate <code>PivotFacetValue</code> object, may be null
- * if we "missing" is not in use, or if it does not meat the mincount.
- */
- public PivotFacetValue getMissingValue(){
- return missingValue;
- }
-
- /**
- * Read-Only access to the Collection of {@link PivotFacetValue}s corresponding to
- * non-missing values.
- *
- * @see #getMissingValue
- */
- public List<PivotFacetValue> getExplicitValuesList() {
- return Collections.unmodifiableList(explicitValues);
- }
-
- /**
- * Size of {@link #getExplicitValuesList}
- */
- public int getExplicitValuesListSize() {
- return this.explicitValues.size();
- }
-
- /**
- * Total number of {@link PivotFacetValue}s, including the "missing" value if used.
- *
- * @see #getMissingValue
- * @see #getExplicitValuesList
- */
- public int size() {
- return this.getExplicitValuesListSize() + (this.missingValue == null ? 0 : 1);
- }
-
- /**
- * Returns the appropriate sub-list of the explicit values that need to be refined,
- * based on the {@link FacetParams#FACET_OFFSET} and {@link FacetParams#FACET_LIMIT}
- * for this field.
- *
- * @see #getExplicitValuesList
- * @see List#subList
- */
- public List<PivotFacetValue> getNextLevelValuesToRefine() {
- final int numRefinableValues = getExplicitValuesListSize();
- if (facetFieldOffset < numRefinableValues) {
- final int offsetPlusCount = (facetFieldLimit >= 0)
- ? Math.min(facetFieldLimit + facetFieldOffset, numRefinableValues)
- : numRefinableValues;
- return getExplicitValuesList().subList(facetFieldOffset, offsetPlusCount);
- } else {
- return Collections.<PivotFacetValue>emptyList();
- }
- }
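
Concretely: with 10 explicit values, offset=2 and limit=5, the refinable window is subList(2, min(2+5, 10)), i.e. indexes 2 through 6; a negative limit extends the window to the end of the list. A standalone sketch of the same arithmetic (hypothetical values):

    import java.util.Collections;
    import java.util.List;
    import java.util.stream.Collectors;
    import java.util.stream.IntStream;

    public class WindowSketch {
      public static void main(String[] args) {
        List<Integer> values = IntStream.range(0, 10).boxed().collect(Collectors.toList());
        int offset = 2, limit = 5;
        int end = (limit >= 0) ? Math.min(limit + offset, values.size()) : values.size();
        List<Integer> window = (offset < values.size())
            ? values.subList(offset, end)
            : Collections.<Integer>emptyList();
        System.out.println(window); // [2, 3, 4, 5, 6]
      }
    }
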
-
- /**
- * Fast lookup to retrieve a {@link PivotFacetValue} from this collection if it
- * exists
- *
- * @param value of the <code>PivotFacetValue</code> to lookup, if
- * <code>null</code> this returns the same as {@link #getMissingValue}
- * @return the corresponding <code>PivotFacetValue</code> or null if there is
- * no <code>PivotFacetValue</code> in this collection corresponding to
- * the specified value.
- */
- public PivotFacetValue get(Comparable value){
- return valuesMap.get(value);
- }
-
- /**
- * Fetches a {@link PivotFacetValue} from this collection via its index; may not
- * be used to fetch the <code>PivotFacetValue</code> corresponding to the missing-value.
- *
- * @see #getExplicitValuesList
- * @see List#get(int)
- * @see #getMissingValue
- */
- public PivotFacetValue getAt(int index){
- return explicitValues.get(index);
- }
-
- /**
- * Adds a {@link PivotFacetValue} to this collection -- callers must not use this
- * method if a {@link PivotFacetValue} with the same value already exists in this collection
- */
- public void add(PivotFacetValue pfValue) {
- Comparable val = pfValue.getValue();
- assert ! this.valuesMap.containsKey(val)
- : "Must not add duplicate PivotFacetValue with redundent inner value";
-
- dirty = true;
- if(null == val) {
- this.missingValue = pfValue;
- } else {
- this.explicitValues.add(pfValue);
- }
- this.valuesMap.put(val, pfValue);
- }
-
-
- /**
- * Destructive method that recursively prunes values from the data structure
- * based on the counts for those values and the effective sort, mincount, limit,
- * and offset being used for each field.
- * <p>
- * This method should only be called after all refinement is completed.
- * </p>
- *
- * @see PivotFacetField#trim
- * @see PivotFacet#getTrimmedPivotsAsListOfNamedLists
- */
- public void trim() { // NOTE: destructive
- // TODO: see comment in PivotFacetField about potential optimization
- // (ie: trim as we refine)
- trimNonNullValues();
- trimNullValue();
- }
-
- private void trimNullValue(){
- if (missingValue == null) {
- return;
- }
-
- if (missingValue.getCount() >= facetFieldMinimumCount){
- if (null != missingValue.getChildPivot()) {
- missingValue.getChildPivot().trim();
- }
- } else { // missing count less than mincount
- missingValue = null;
- }
- }
-
- private void trimNonNullValues(){
- if (explicitValues != null && explicitValues.size() > 0) {
-
- sort();
-
- ArrayList<PivotFacetValue> trimmedValues = new ArrayList<>();
-
- int facetsSkipped = 0;
-
- for (PivotFacetValue pivotValue : explicitValues) {
-
- if (pivotValue.getCount() >= facetFieldMinimumCount) {
- if (facetsSkipped >= facetFieldOffset) {
- trimmedValues.add(pivotValue);
- if (pivotValue.getChildPivot() != null) {
- pivotValue.getChildPivot().trim();
- }
- if (facetFieldLimit > 0 && trimmedValues.size() >= facetFieldLimit) {
- break;
- }
- } else {
- facetsSkipped++;
- }
- }
- }
-
- explicitValues = trimmedValues;
- valuesMap.clear();
- }
- }
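
The pruning loop above applies mincount first, then consumes offset many surviving values, then keeps at most limit of the rest. With counts [9, 7, 4, 2, 0] (already sorted), mincount=1, offset=1, limit=2, the survivors are [7, 4]: 9 is consumed by the offset, 7 and 4 fill the limit, and the loop stops before reaching 2 and 0 (the latter would fail mincount anyway). A standalone sketch (hypothetical numbers):

    import java.util.ArrayList;
    import java.util.List;

    public class TrimSketch {
      public static void main(String[] args) {
        int[] counts = {9, 7, 4, 2, 0}; // already in the effective facet sort order
        int mincount = 1, offset = 1, limit = 2;
        List<Integer> kept = new ArrayList<>();
        int skipped = 0;
        for (int c : counts) {
          if (c < mincount) continue;                    // below mincount: drop
          if (skipped < offset) { skipped++; continue; } // consumed by the offset
          kept.add(c);
          if (limit > 0 && kept.size() >= limit) break;  // limit reached
        }
        System.out.println(kept); // [7, 4]
      }
    }
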
-
- /**
- * Sorts the collection and recursively sorts the collections associated with
- * any sub-pivots.
- *
- * @see FacetParams#FACET_SORT
- * @see PivotFacetField#sort
- */
- public void sort() {
-
- if (dirty) {
- if (facetFieldSort.equals(FacetParams.FACET_SORT_COUNT)) {
- Collections.sort(this.explicitValues, new PivotFacetCountComparator());
- } else if (facetFieldSort.equals(FacetParams.FACET_SORT_INDEX)) {
- Collections.sort(this.explicitValues, new PivotFacetValueComparator());
- }
- dirty = false;
- }
-
- for (PivotFacetValue value : this.explicitValues)
- if (value.getChildPivot() != null) {
- value.getChildPivot().sort();
- }
-
- if (missingValue != null && missingValue.getChildPivot() != null) {
- missingValue.getChildPivot().sort();
- }
- }
-
- /**
- * Iterator over all elements in this Collection, including the result of
- * {@link #getMissingValue} as the last element (if it exists)
- */
- @Override
- public Iterator<PivotFacetValue> iterator() {
- Iterator<PivotFacetValue> it = new Iterator<PivotFacetValue>() {
- private final Iterator<PivotFacetValue> valuesIterator = explicitValues.iterator();
- private boolean shouldGiveMissingValue = (missingValue != null);
-
- @Override
- public boolean hasNext() {
- return valuesIterator.hasNext() || shouldGiveMissingValue;
- }
-
- @Override
- public PivotFacetValue next() {
- if (valuesIterator.hasNext()) {
- return valuesIterator.next();
- }
- if (shouldGiveMissingValue) {
- shouldGiveMissingValue = false;
- return missingValue;
- }
- return null;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException("Can't remove from this iterator");
- }
- };
- return it;
- }
-
- /** Sorts {@link PivotFacetValue} instances by their count */
- public static class PivotFacetCountComparator implements Comparator<PivotFacetValue> {
- public int compare(PivotFacetValue left, PivotFacetValue right) {
- int countCmp = Integer.compare(right.getCount(), left.getCount());
- return (0 != countCmp) ? countCmp :
- compareWithNullLast(left.getValue(), right.getValue());
- }
- }
-
- /** Sorts {@link PivotFacetValue} instances by their value */
- public static class PivotFacetValueComparator implements Comparator<PivotFacetValue> {
- public int compare(PivotFacetValue left, PivotFacetValue right) {
- return compareWithNullLast(left.getValue(), right.getValue());
- }
- }
-
- /**
- * A helper method for use in <code>Comparator</code> classes where object properties
- * are <code>Comparable</code> but may be null.
- */
- static int compareWithNullLast(final Comparable o1, final Comparable o2) {
- if (null == o1) {
- if (null == o2) {
- return 0;
- }
- return 1; // o1 is null, o2 is not
- }
- if (null == o2) {
- return -1; // o2 is null, o1 is not
- }
- return o1.compareTo(o2);
- }
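
In use, this helper sorts the null "missing" bucket behind every real value, so it lands last under value sort and loses count ties. A minimal usage sketch (hypothetical class name, mirroring the raw-Comparable style above):

    import java.util.Arrays;
    import java.util.List;

    public class NullLastSketch {
      static int compareWithNullLast(Comparable o1, Comparable o2) {
        if (null == o1) return (null == o2) ? 0 : 1;
        if (null == o2) return -1;
        return o1.compareTo(o2);
      }
      public static void main(String[] args) {
        List<String> vals = Arrays.asList("b", null, "a");
        vals.sort(NullLastSketch::compareWithNullLast); // nulls sort after all values
        System.out.println(vals); // [a, b, null]
      }
    }
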
-
- public String toString(){
- return String.format(Locale.ROOT, "Values:%s | Missing:%s ", explicitValues, missingValue);
- }
-}
-
-