You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by Apache Wiki <wi...@apache.org> on 2009/02/11 00:41:57 UTC

[Solr Wiki] Update of "Deduplication" by YonikSeeley

Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Solr Wiki" for change notification.

The following page has been changed by YonikSeeley:
http://wiki.apache.org/solr/Deduplication

------------------------------------------------------------------------------
  
  Implementations:
  
- || MD5Signature || Used for exact duplicate detection. ||
+ || MD5Signature || 128 bit hash used for exact duplicate detection. ||
+ || Lookup3Signature || 64 bit hash used for exact duplicate detection; much faster than MD5 and smaller to index. ||
  || TextProfileSignature || Fuzzy hashing implementation from Nutch for near duplicate detection. It's tunable but works best on longer text. ||
  
  There are other more sophisticated algorithms for fuzzy/near hashing that could be added later.
@@ -52, +53 @@

    <updateRequestProcessorChain name="dedupe">
      <processor
        class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
- 
      </processor>
      <processor class="solr.RunUpdateProcessorFactory" />
    </updateRequestProcessorChain>
@@ -60, +60 @@

  
  Example settings:
  {{{
+   <!-- An example dedup update processor that creates the "id" field on the fly
+        based on the hash code of some other fields.  This example has overwriteDupes
+        set to false since we are using the id field as the signatureField and Solr
+        will maintain uniqueness based on that anyway. -->
    <updateRequestProcessorChain name="dedupe">
-     <processor
-       class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
+     <processor class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
- 
-         <bool name="enabled">true</bool>
+       <bool name="enabled">true</bool>
+       <str name="signatureField">id</str>
-         <bool name="overwriteDupes">true</bool>
+       <bool name="overwriteDupes">false</bool>
+       <str name="fields">name,features,cat</str>
+       <str name="signatureClass">org.apache.solr.update.processor.Lookup3Signature</str>
-         <arr name="fields"> <str>field1</str> <str>field2</str> </arr>
-  	<str name="signatureClass">
-           org.apache.solr.update.processor.TextProfileSignature
- 	</str>
-         <str name="signatureField">signatureField</str>
-  
      </processor>
+     <processor class="solr.LogUpdateProcessorFactory" />
      <processor class="solr.RunUpdateProcessorFactory" />
    </updateRequestProcessorChain>
  }}}
@@ -89, +89 @@

    </requestHandler>
  }}}
  
+ The update processor chain can also be selected per request by passing the parameter {{{update.processor=dedupe}}}.
+ 
  == Settings ==
  
  || '''Setting''' || '''Default''' || '''Description''' ||