You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Ravi Solr <ra...@gmail.com> on 2012/11/19 17:40:00 UTC

copyField multiValued duplicates

Hello,
      I have a couple of questions. I need an easy way to clean up a gaffe
with copyFields (close to a million docs). Is there any way we could remove
duplicates emitted via copyField while re-indexing? Also, is there a way to
query multiValued fields to return only docs that have duplicated values?

The fields having the issue are declared as follows:

    <!-- keywordText: a solr.TextField whose index-time and query-time
         analyzers declare the same chain: KeywordTokenizerFactory, then
         trim, stop-word, synonym (expand="false") and duplicate-token
         filters. -->
    <fieldType name="keywordText"
               class="solr.TextField"
               sortMissingLast="true"
               omitNorms="true"
               positionIncrementGap="100">

      <!-- Index-time analysis chain. -->
      <analyzer type="index">
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <filter class="solr.TrimFilterFactory"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"/>
        <filter class="solr.SynonymFilterFactory"
                tokenizerFactory="solr.KeywordTokenizerFactory"
                synonyms="person-synonyms.txt,organization-synonyms.txt,location-synonyms.txt,subject-synonyms.txt"
                ignoreCase="true"
                expand="false"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>

      <!-- Query-time analysis chain (same steps as the index-time chain). -->
      <analyzer type="query">
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <filter class="solr.TrimFilterFactory"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"/>
        <filter class="solr.SynonymFilterFactory"
                tokenizerFactory="solr.KeywordTokenizerFactory"
                synonyms="person-synonyms.txt,organization-synonyms.txt,location-synonyms.txt,subject-synonyms.txt"
                ignoreCase="true"
                expand="false"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>
    <!-- text: a solr.TextField whose index-time and query-time analyzers
         declare the same chain: WhitespaceTokenizerFactory, then trim,
         lower-case, synonym (expand="true"), stop-word, word-delimiter,
         English Porter stemming and duplicate-token filters. -->
    <fieldType name="text"
               class="solr.TextField"
               sortMissingLast="true"
               omitNorms="true"
               positionIncrementGap="100">

      <!-- Index-time analysis chain. -->
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.TrimFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SynonymFilterFactory"
                synonyms="person-synonyms.txt,organization-synonyms.txt,location-synonyms.txt,subject-synonyms.txt"
                ignoreCase="true"
                expand="true"/>
        <!-- Case insensitive stop word removal.
             enablePositionIncrements=true ensures that a 'gap' is left to
             allow for accurate phrase queries. -->
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="0"
                generateNumberParts="0"
                catenateWords="0"
                catenateNumbers="0"
                catenateAll="0"
                splitOnCaseChange="0"
                protected="protwords.txt"/>
        <filter class="solr.EnglishPorterFilterFactory"
                protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>

      <!-- Query-time analysis chain (same steps as the index-time chain). -->
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.TrimFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SynonymFilterFactory"
                synonyms="person-synonyms.txt,organization-synonyms.txt,location-synonyms.txt,subject-synonyms.txt"
                ignoreCase="true"
                expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="0"
                generateNumberParts="0"
                catenateWords="0"
                catenateNumbers="0"
                catenateAll="0"
                splitOnCaseChange="0"
                protected="protwords.txt"/>
        <filter class="solr.EnglishPorterFilterFactory"
                protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

<!-- "city" uses the keywordText type; "cityLower" uses the text type.
     Both are declared indexed, stored and multiValued. -->
<field name="city"
       type="keywordText"
       indexed="true"
       stored="true"
       multiValued="true"
       termVectors="true"/>
<field name="cityLower"
       type="text"
       indexed="true"
       stored="true"
       multiValued="true"
       termVectors="false"/>

<!-- Declares "city" as the copy source and "cityLower" as the destination. -->
<copyField source="city" dest="cityLower"/>

Query results look as follows:

<arr name="city">
    <str>No city</str>
</arr>
<arr name="cityLower">
   <str>No city</str>
   <str>No city</str>
   <str>No city</str>
   <str>No city</str>
   <str>No city</str>
   <str>No city</str>
   <str>No city</str>
   <str>No city</str>
   <str>No city</str>
   <str>No city</str>
</arr>

Thanks,

Ravi Kiran Bhaskar