You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by Apache Wiki <wi...@apache.org> on 2009/02/11 00:41:57 UTC
[Solr Wiki] Update of "Deduplication" by YonikSeeley
Dear Wiki user,
You have subscribed to a wiki page or wiki category on "Solr Wiki" for change notification.
The following page has been changed by YonikSeeley:
http://wiki.apache.org/solr/Deduplication
------------------------------------------------------------------------------
Implementations:
- || MD5Signature || Used for exact duplicate detection. ||
+ || MD5Signature || 128 bit hash used for exact duplicate detection. ||
+ || Lookup3Signature || 64 bit hash used for exact duplicate detection; much faster than MD5 and smaller to index. ||
|| TextProfileSignature || Fuzzy hashing implementation from Nutch for near-duplicate detection. It's tunable, but works best on longer text. ||
There are other more sophisticated algorithms for fuzzy/near hashing that could be added later.
@@ -52, +53 @@
<updateRequestProcessorChain name="dedupe">
<processor
class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
-
</processor>
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
@@ -60, +60 @@
Example settings:
{{{
+ <!-- An example dedup update processor that creates the "id" field on the fly
+ based on the hash code of some other fields. This example has overwriteDupes
+ set to false since we are using the id field as the signatureField and Solr
+ will maintain uniqueness based on that anyway. -->
<updateRequestProcessorChain name="dedupe">
- <processor
- class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
+ <processor class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
-
- <bool name="enabled">true</bool>
+ <bool name="enabled">true</bool>
+ <str name="signatureField">id</str>
- <bool name="overwriteDupes">true</bool>
+ <bool name="overwriteDupes">false</bool>
+ <str name="fields">name,features,cat</str>
+ <str name="signatureClass">org.apache.solr.update.processor.Lookup3Signature</str>
- <arr name="fields"> <str>field1</str> <str>field2</str> </arr>
- <str name="signatureClass">
- org.apache.solr.update.processor.TextProfileSignature
- </str>
- <str name="signatureField">signatureField</str>
-
</processor>
+ <processor class="solr.LogUpdateProcessorFactory" />
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
}}}
@@ -89, +89 @@
</requestHandler>
}}}
+ The update processor chain can also be selected per request by passing the parameter {{{update.processor=dedupe}}}.
+
== Settings ==
|| '''Setting''' || '''Default''' || '''Description''' ||