You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by Kashif Khadim <ka...@yahoo.com> on 2005/08/04 01:17:06 UTC
duplicates from multiple index
Hi ,
I have multiple Lucene indexes and want to know how I can
delete duplicates from them. I am using
MultiSearcher to search across them. I have duplicate
"urls" in these indexes; any sample code or tool would
be a big help.
Thanks,
Kashif.
__________________________________________________
Do You Yahoo!?
Tired of spam? Yahoo! Mail has the best spam protection around
http://mail.yahoo.com
---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org
Re: duplicates from multiple index
Posted by Kashif Khadim <ka...@yahoo.com>.
Hi David,
It works very well and thanks a lot for your help.
Kashif
--- David Spencer <da...@tropo.com> wrote:
> Kashif Khadim wrote:
>
> > Hi ,
> >
> > I have multiple index of lucene and want know how
> can
> > i delete duplicates from these index. I am using
> > MultiSearcher to search on these. I have
> duplicates
> > "urls" in these index, any sample code or tool
> will
> > be a big help.
>
> Here's some ancient code that I've used - consider
> it a "fragment" i.e.
> it won't compile as it needs a couple of other
> classes, but it should be
> obvious what's missing (.e.g. DFields.URL is
> something like "url").
>
>
>
> package com.tropo.lucene;
>
> import org.apache.lucene.analysis.*;
> import org.apache.lucene.index.*;
> import org.apache.lucene.document.*;
> import org.apache.lucene.analysis.standard.*;
>
> import java.io.*;
> import java.util.*;
> import com.tropo.lang.*;
>
> /**
> *
> */
> public class DupUrls
> extends com.tropo.TRBase
> {
> public static void main(String[] args)
> throws Throwable
> {
> boolean show = false;
> if ( args.length > 0)
> {
> for ( int i = 0; i< args.length; i++)
> {
> if ( args[ i].equals( "-s"))
> {
> show = true;
> continue;
> }
> if (show)
> showdups( args[ i]);
> else
> dedup( args[ i]);
> }
> System.exit( 0);
> }
> String[] ar = new File( ".").list();
> for ( int ii = 0; ii < ar.length; ii++)
> {
> if ( ar[ ii].endsWith( "_index") &&
> new File( ar[ ii]).isDirectory())
> {
> dedup( ar[ ii]);
> }
> }
> System.exit( 0);
> }
>
> /**
> *
> */
> private static void showdups( String foo)
> throws Throwable
> {
> IndexReader r = IndexReader.open( foo);
> Set already = new HashSet();
> int num = r.numDocs();
> for ( int i = 0; i < num; i++)
> {
> if ( r.isDeleted( i)) continue;
> Document d = r.document( i);
> String u = d.get( DFields.URL);
> if ( already.contains( u))
> o.println( u);
> else
> already.add( u);
> }
> r.close();
> }
>
> /**
> *
> */
> private static void dedup( String foo)
> throws Throwable
> {
> IndexReader r = IndexReader.open( foo);
> int num = r.numDocs();
> o.println( foo + ": " + num);
> Set already = new HashSet();
> int nd = 0;
> int stale = 0;
> PrintStream ps = new PrintStream( new
> FileOutputStream( foo + ".txt"));
> ps.println( "File: " + foo);
> ps.println( "Docs: " + nf.format( num));
> ps.println();
> for ( int i = 0; i < num; i++)
> {
>
> if ( r.isDeleted( i))
> {
> stale++;
> continue;
> }
> Document d = r.document( i);
> String u = d.get( DFields.URL);
> if ( already.contains( u))
> {
> r.delete( i);
> nd++;
> ps.println( u);
> if ( false)
> {
> Enumeration e = d.fields();
> while ( e.hasMoreElements())
> {
> Field z = (Field) e.nextElement();
> ps.println( "\t\t\tname=" + z.name() + " sv="
> +
> "\"" + z.toString() + "\"" +
> "\"" + z.stringValue() + "\"" +
> " stored=" +
> z.isStored() + "/ indexed=" +
> z.isIndexed() + "/ tokenized=" +
> z.isTokenized());
> }
> ps.println();
> }
> }
> else
> already.add( u);
> }
> if ( nd > 0)
> o.println( "\t\tdup=" + nf.format( nd));
> if ( stale > 0)
> o.println( "\t\tstale=" + nf.format( stale));
> ps.close();
> r.close();
> o.println( "before opt");
> IndexWriter writer = new IndexWriter( foo,
> IndexBase.getAnalyzer(),
> false);
> writer.optimize();
> o.println( "after opt");
> writer.close();
> o.println( "after close");
> }
>
>
> static PrintStream o = System.out;
>
>
> }
>
>
> >
> > Thanks,
> > Kashif.
> >
> >
> >
> > __________________________________________________
> > Do You Yahoo!?
> > Tired of spam? Yahoo! Mail has the best spam
> protection around
> > http://mail.yahoo.com
> >
> >
>
---------------------------------------------------------------------
> > To unsubscribe, e-mail:
> java-dev-unsubscribe@lucene.apache.org
> > For additional commands, e-mail:
> java-dev-help@lucene.apache.org
> >
>
>
>
---------------------------------------------------------------------
> To unsubscribe, e-mail:
> java-dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail:
> java-dev-help@lucene.apache.org
>
>
__________________________________________________
Do You Yahoo!?
Tired of spam? Yahoo! Mail has the best spam protection around
http://mail.yahoo.com
---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org
Re: duplicates from multiple index
Posted by David Spencer <da...@tropo.com>.
Kashif Khadim wrote:
> Hi ,
>
> I have multiple index of lucene and want know how can
> i delete duplicates from these index. I am using
> MultiSearcher to search on these. I have duplicates
> "urls" in these index, any sample code or tool will
> be a big help.
Here's some ancient code that I've used - consider it a "fragment", i.e.
it won't compile as it needs a couple of other classes, but it should be
obvious what's missing (e.g. DFields.URL is something like "url").
package com.tropo.lucene;
import org.apache.lucene.analysis.*;
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.lucene.analysis.standard.*;
import java.io.*;
import java.util.*;
import com.tropo.lang.*;
/**
 * Command-line utility that looks for documents sharing the same stored
 * {@code DFields.URL} value inside a Lucene index and either lists them or
 * deletes every occurrence after the first one.
 *
 * <p>Usage:
 * <ul>
 * <li>{@code DupUrls index...} - de-duplicate each named index.</li>
 * <li>{@code DupUrls -s index...} - only print the duplicate URLs of every
 *     index named after the {@code -s} flag.</li>
 * <li>{@code DupUrls} (no arguments) - de-duplicate every directory in the
 *     current working directory whose name ends in {@code _index}.</li>
 * </ul>
 *
 * <p>Duplicates are detected within one index at a time; URLs are not
 * compared across different indexes.
 *
 * <p>NOTE(review): {@code nf}, {@code DFields} and {@code IndexBase} are not
 * declared in this file. {@code nf} is presumably a number formatter
 * inherited from {@code com.tropo.TRBase} - confirm.
 */
public class DupUrls
    extends com.tropo.TRBase
{
    /** Console output stream used for progress and summary messages. */
    static PrintStream o = System.out;

    /**
     * When true, every field of a deleted duplicate is also written to the
     * per-index report. Off by default; kept as a debugging aid.
     */
    private static final boolean DUMP_FIELDS = false;

    /**
     * Entry point; see the class comment for the accepted arguments.
     * Always terminates the JVM with exit status 0 on normal completion.
     *
     * @param args index directories, optionally preceded by {@code -s}
     * @throws Throwable any failure raised while reading or updating an index
     */
    public static void main(String[] args)
        throws Throwable
    {
        if (args.length > 0)
        {
            // "-s" is positional: it only affects the indexes named after it.
            boolean show = false;
            for (int i = 0; i < args.length; i++)
            {
                if (args[i].equals("-s"))
                {
                    show = true;
                    continue;
                }
                if (show)
                {
                    showdups(args[i]);
                }
                else
                {
                    dedup(args[i]);
                }
            }
        }
        else
        {
            String[] ar = new File(".").list();
            if (ar == null)
            {
                // File.list() returns null on an I/O error; fail with a
                // meaningful message instead of a NullPointerException.
                throw new IOException("Cannot list the current directory");
            }
            for (int ii = 0; ii < ar.length; ii++)
            {
                if (ar[ii].endsWith("_index") &&
                    new File(ar[ii]).isDirectory())
                {
                    dedup(ar[ii]);
                }
            }
        }
        System.exit(0);
    }

    /**
     * Prints to {@link #o} the URL of every live document whose URL has
     * already been seen earlier in the same index. The index is not modified.
     *
     * @param foo path of the index directory
     * @throws Throwable if the index cannot be opened or read
     */
    private static void showdups(String foo)
        throws Throwable
    {
        IndexReader r = IndexReader.open(foo);
        try
        {
            Set already = new HashSet();
            // Document ids range over [0, maxDoc()); numDocs() excludes
            // deleted documents and would cut the scan short whenever the
            // index contains deletions.
            int max = r.maxDoc();
            for (int i = 0; i < max; i++)
            {
                if (r.isDeleted(i))
                {
                    continue;
                }
                String u = r.document(i).get(DFields.URL);
                if (u == null)
                {
                    // No URL stored: nothing to compare, so not a duplicate.
                    continue;
                }
                if (!already.add(u))
                {
                    o.println(u);
                }
            }
        }
        finally
        {
            r.close();
        }
    }

    /**
     * Deletes from the index every live document whose URL was already seen
     * on an earlier document, writes the removed URLs to the report file
     * {@code foo + ".txt"}, prints a summary to {@link #o} and finally
     * optimizes the index.
     *
     * @param foo path of the index directory
     * @throws Throwable if the index or the report file cannot be accessed
     */
    private static void dedup(String foo)
        throws Throwable
    {
        int nd = 0;     // duplicates deleted by this run
        int stale = 0;  // documents that were already deleted beforehand

        IndexReader r = IndexReader.open(foo);
        try
        {
            int num = r.numDocs();
            o.println(foo + ": " + num);

            PrintStream ps = new PrintStream(new FileOutputStream(foo + ".txt"));
            try
            {
                ps.println("File: " + foo);
                ps.println("Docs: " + nf.format(num));
                ps.println();

                Set already = new HashSet();
                // Scan every document id, not just numDocs() of them: with
                // existing deletions numDocs() < maxDoc() and the tail of
                // the index would otherwise never be checked.
                int max = r.maxDoc();
                for (int i = 0; i < max; i++)
                {
                    if (r.isDeleted(i))
                    {
                        stale++;
                        continue;
                    }
                    Document d = r.document(i);
                    String u = d.get(DFields.URL);
                    if (u == null)
                    {
                        // Documents without a URL must not be treated as
                        // duplicates of each other and deleted.
                        continue;
                    }
                    if (!already.add(u))
                    {
                        r.delete(i);
                        nd++;
                        ps.println(u);
                        if (DUMP_FIELDS)
                        {
                            dumpFields(ps, d);
                        }
                    }
                }
            }
            finally
            {
                ps.close();
            }

            if (nd > 0)
            {
                o.println("\t\tdup=" + nf.format(nd));
            }
            if (stale > 0)
            {
                o.println("\t\tstale=" + nf.format(stale));
            }
        }
        finally
        {
            // Always close the reader so it is not leaked on failure.
            r.close();
        }

        o.println("before opt");
        IndexWriter writer = new IndexWriter(foo, IndexBase.getAnalyzer(),
                                             false);
        try
        {
            writer.optimize();
            o.println("after opt");
        }
        finally
        {
            writer.close();
        }
        o.println("after close");
    }

    /**
     * Writes one debug line per field of {@code d} to {@code ps}, followed by
     * a blank line.
     */
    private static void dumpFields(PrintStream ps, Document d)
    {
        Enumeration e = d.fields();
        while (e.hasMoreElements())
        {
            Field z = (Field) e.nextElement();
            ps.println("\t\t\tname=" + z.name() + " sv=" +
                       "\"" + z.toString() + "\"" +
                       "\"" + z.stringValue() + "\"" +
                       " stored=" +
                       z.isStored() + "/ indexed=" +
                       z.isIndexed() + "/ tokenized=" +
                       z.isTokenized());
        }
        ps.println();
    }
}
>
> Thanks,
> Kashif.
>
>
>
> __________________________________________________
> Do You Yahoo!?
> Tired of spam? Yahoo! Mail has the best spam protection around
> http://mail.yahoo.com
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-dev-help@lucene.apache.org
>
---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org