You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Ravi Solr <ra...@gmail.com> on 2012/11/19 17:40:00 UTC
copyField multiValued duplicates
Hello,
I have a couple of questions. I need an easy way to clean up a mistake
with copyFields (affecting close to a million docs). Is there any way to remove
the duplicate values emitted via copyField while re-indexing? Also, is there a way to
query multiValued fields so that only docs containing duplicated values are returned?
The fields with the issue are declared as follows:
<!--
  keywordText: TextField whose index-time and query-time analyzers are
  configured identically: keyword tokenizer, trim, stop-word removal,
  synonym mapping (expand="false"), then duplicate-token removal.
-->
<fieldType name="keywordText"
           class="solr.TextField"
           sortMissingLast="true"
           omitNorms="true"
           positionIncrementGap="100">

  <!-- Analysis chain applied when documents are indexed. -->
  <analyzer type="index">
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.SynonymFilterFactory"
            tokenizerFactory="solr.KeywordTokenizerFactory"
            synonyms="person-synonyms.txt,organization-synonyms.txt,location-synonyms.txt,subject-synonyms.txt"
            ignoreCase="true"
            expand="false"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>

  <!-- Analysis chain applied to query text; same steps as the index chain. -->
  <analyzer type="query">
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.SynonymFilterFactory"
            tokenizerFactory="solr.KeywordTokenizerFactory"
            synonyms="person-synonyms.txt,organization-synonyms.txt,location-synonyms.txt,subject-synonyms.txt"
            ignoreCase="true"
            expand="false"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
<!--
  text: TextField whose index-time and query-time analyzers are configured
  identically: whitespace tokenizer, trim, lower-casing, synonym mapping
  (expand="true"), stop-word removal, word-delimiter filter with every
  generate/catenate/split option set to 0, English Porter stemming, then
  duplicate-token removal.
-->
<fieldType name="text"
           class="solr.TextField"
           sortMissingLast="true"
           omitNorms="true"
           positionIncrementGap="100">

  <!-- Analysis chain applied when documents are indexed. -->
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SynonymFilterFactory"
            synonyms="person-synonyms.txt,organization-synonyms.txt,location-synonyms.txt,subject-synonyms.txt"
            ignoreCase="true"
            expand="true"/>
    <!--
      Case-insensitive stop-word removal. enablePositionIncrements="true"
      leaves a 'gap' where a stop word was dropped so that phrase queries
      stay accurate.
    -->
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="0"
            generateNumberParts="0"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="0"
            protected="protwords.txt"/>
    <filter class="solr.EnglishPorterFilterFactory"
            protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>

  <!-- Analysis chain applied to query text; same steps as the index chain. -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SynonymFilterFactory"
            synonyms="person-synonyms.txt,organization-synonyms.txt,location-synonyms.txt,subject-synonyms.txt"
            ignoreCase="true"
            expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="0"
            generateNumberParts="0"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="0"
            protected="protwords.txt"/>
    <filter class="solr.EnglishPorterFilterFactory"
            protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
<!-- Source field: keywordText analysis, stored, multi-valued, term vectors on. -->
<field name="city"
       type="keywordText"
       indexed="true"
       stored="true"
       multiValued="true"
       termVectors="true"/>

<!-- Destination field: text analysis, stored, multi-valued, term vectors off. -->
<field name="cityLower"
       type="text"
       indexed="true"
       stored="true"
       multiValued="true"
       termVectors="false"/>

<!--
  Every value of city is also copied into cityLower.
  NOTE(review): cityLower is both stored and a copyField destination. If
  documents are re-indexed by feeding previously stored values (including
  cityLower) back in, each pass would presumably add another copy of the
  city value; confirm against the indexing client.
-->
<copyField source="city" dest="cityLower"/>
Query results look as follows:
<arr name="city">
  <str>No city</str>
</arr>
<arr name="cityLower">
  <str>No city</str>
  <str>No city</str>
  <str>No city</str>
  <str>No city</str>
  <str>No city</str>
  <str>No city</str>
  <str>No city</str>
  <str>No city</str>
  <str>No city</str>
  <str>No city</str>
</arr>
Thanks,
Ravi Kiran Bhaskar