You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by Kashif Khadim <ka...@yahoo.com> on 2005/08/04 01:17:06 UTC

duplicates from multiple index

Hi ,

I have multiple Lucene indexes and want to know how I can
delete duplicates from them. I am using
MultiSearcher to search across these indexes. I have duplicate
"urls" in these indexes; any sample code or tool would
be a big help.

Thanks,
Kashif.



__________________________________________________
Do You Yahoo!?
Tired of spam?  Yahoo! Mail has the best spam protection around 
http://mail.yahoo.com 

---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org


Re: duplicates from multiple index

Posted by Kashif Khadim <ka...@yahoo.com>.
Hi David,

It works very well and thanks a lot for your help.


Kashif

--- David Spencer <da...@tropo.com> wrote:

> Kashif Khadim wrote:
> 
> > Hi ,
> > 
> > I have multiple index of lucene and want know how
> can
> > i delete duplicates from these index. I am using
> > MultiSearcher to search on these. I have
> duplicates
> > "urls" in these index, any sample code  or tool
> will
> > be a big help.
> 
> Here's some ancient code that I've used - consider
> it a "fragment" i.e. 
> it won't compile as it needs a couple of other
> classes, but it should be 
> obvious what's missing (.e.g. DFields.URL is
> something like "url").
> 
> 
> 
> package com.tropo.lucene;
> 
> import org.apache.lucene.analysis.*;
> import org.apache.lucene.index.*;
> import org.apache.lucene.document.*;
> import org.apache.lucene.analysis.standard.*;
> 
> import java.io.*;
> import java.util.*;
> import com.tropo.lang.*;
> 
> /**
>   *
>   */
> public class DupUrls
> 	extends com.tropo.TRBase
> {
> 	public static void main(String[] args)
> 		throws Throwable
> 	{
> 		boolean show = false;
> 		if ( args.length > 0)
> 		{
> 			for ( int i = 0; i< args.length; i++)
> 			{
> 				if ( args[ i].equals( "-s"))
> 				{
> 					show = true;
> 					continue;
> 				}
> 				if (show)
> 					showdups( args[ i]);
> 				else
> 					dedup( args[ i]);				
> 			}
> 			System.exit( 0);
> 		}
> 		String[] ar = new File( ".").list();
> 		for ( int ii = 0; ii < ar.length; ii++)
> 		{
> 			if ( ar[ ii].endsWith( "_index") &&
> 				 new File( ar[ ii]).isDirectory())
> 			{
> 				dedup( ar[ ii]);
> 			}
> 		}
> 		System.exit( 0);		
> 	}
> 
> 	/**
> 	 *
> 	 */
> 	private static void showdups( String foo)
> 		throws Throwable
> 	{
> 		IndexReader r = IndexReader.open( foo);
> 		Set already = new HashSet();
> 		int num = r.numDocs();
> 		for ( int i = 0; i < num; i++)
> 		{
> 			if ( r.isDeleted( i)) continue;
> 			Document d = r.document( i);
> 			String u = d.get( DFields.URL);
> 			if ( already.contains( u))
> 				o.println( u);
> 			else
> 				already.add( u);
> 		}
> 		r.close();
> 	}
> 
> 	/**
> 	 *
> 	 */
> 	private static void dedup( String foo)
> 		throws Throwable
> 	{
> 		IndexReader r = IndexReader.open( foo);
> 		int num = r.numDocs();
> 		o.println( foo + ": " + num);
> 		Set already = new HashSet();
> 		int nd = 0;
> 		int stale = 0;
> 		PrintStream ps = new PrintStream( new
> FileOutputStream( foo + ".txt"));
> 		ps.println( "File: " + foo);
> 		ps.println( "Docs: " + nf.format( num));
> 		ps.println();
> 		for ( int i = 0; i < num; i++)
> 		{
> 
> 			if ( r.isDeleted( i))
> 			{
> 				stale++;
> 				continue;
> 			}
> 			Document d = r.document( i);			
> 			String u = d.get( DFields.URL);
> 			if ( already.contains( u))
> 			{
> 				r.delete( i);
> 				nd++;
> 				ps.println( u);
> 				if ( false)
> 				{
> 					Enumeration e = d.fields();
> 					while ( e.hasMoreElements())
> 					{
> 						Field z = (Field) e.nextElement();
> 						ps.println( "\t\t\tname=" + z.name() + " sv="
> +
> 									"\"" + z.toString() + "\"" +
> 									"\"" + z.stringValue() + "\"" +
> 									" stored=" +
> 									z.isStored() + "/ indexed=" +
> 									z.isIndexed() + "/ tokenized=" +
> 									z.isTokenized());
> 					}
> 					ps.println();
> 				}
> 			}
> 			else
> 				already.add( u);
> 		}
> 		if ( nd > 0)
> 			o.println( "\t\tdup=" + nf.format( nd));
> 		if ( stale > 0)
> 			o.println( "\t\tstale=" + nf.format( stale));
> 		ps.close();
> 		r.close();
> 		o.println( "before opt");
> 		IndexWriter writer = new IndexWriter( foo,
> IndexBase.getAnalyzer(), 
> false);
> 		writer.optimize();
> 		o.println( "after opt");
> 		writer.close();
> 		o.println( "after close");		
> 	}
> 
> 
> 	static PrintStream o = System.out;
> 
> 
> }
> 
> 
> > 
> > Thanks,
> > Kashif.
> > 
> > 
> > 
> > __________________________________________________
> > Do You Yahoo!?
> > Tired of spam?  Yahoo! Mail has the best spam
> protection around 
> > http://mail.yahoo.com 
> > 
> >
>
---------------------------------------------------------------------
> > To unsubscribe, e-mail:
> java-dev-unsubscribe@lucene.apache.org
> > For additional commands, e-mail:
> java-dev-help@lucene.apache.org
> > 
> 
> 
>
---------------------------------------------------------------------
> To unsubscribe, e-mail:
> java-dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail:
> java-dev-help@lucene.apache.org
> 
> 


__________________________________________________
Do You Yahoo!?
Tired of spam?  Yahoo! Mail has the best spam protection around 
http://mail.yahoo.com 

---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org


Re: duplicates from multiple index

Posted by David Spencer <da...@tropo.com>.
Kashif Khadim wrote:

> Hi ,
> 
> I have multiple index of lucene and want know how can
> i delete duplicates from these index. I am using
> MultiSearcher to search on these. I have duplicates
> "urls" in these index, any sample code  or tool will
> be a big help.

Here's some ancient code that I've used - consider it a "fragment", i.e. 
it won't compile as it needs a couple of other classes, but it should be 
obvious what's missing (e.g. DFields.URL is something like "url").



package com.tropo.lucene;

import org.apache.lucene.analysis.*;
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.lucene.analysis.standard.*;

import java.io.*;
import java.util.*;
import com.tropo.lang.*;

/**
 * Command-line utility that lists, or deletes, documents whose URL field
 * duplicates that of an earlier document in the same Lucene index.
 *
 * <p>Duplicates are detected per index: every index directory is scanned on
 * its own, so a URL that occurs once in each of two different indexes is
 * not considered a duplicate. Within one index the first document carrying
 * a URL (lowest document number) is kept and every later one is the
 * duplicate.
 *
 * <p>Usage:
 * <pre>
 *   DupUrls [-s] index ...   handle the named index directories; every path
 *                            that follows "-s" is only listed, every path
 *                            before it is de-duplicated
 *   DupUrls                  de-duplicate each directory of the current
 *                            working directory whose name ends in "_index"
 * </pre>
 */
public class DupUrls
	extends com.tropo.TRBase
{
	/** Debug switch: also write every field of a deleted document to the report. */
	private static final boolean DUMP_FIELDS = false;

	/**
	 * Program entry point; see the class comment for the argument syntax.
	 *
	 * @param args optional "-s" flag and index directory paths
	 * @throws Throwable whatever the index or file operations throw
	 */
	public static void main(String[] args)
		throws Throwable
	{
		if ( args.length > 0)
		{
			// "-s" is positional: it only affects the paths that come after it.
			boolean show = false;
			for ( int i = 0; i < args.length; i++)
			{
				if ( args[ i].equals( "-s"))
				{
					show = true;
					continue;
				}
				if ( show)
					showdups( args[ i]);
				else
					dedup( args[ i]);
			}
		}
		else
		{
			String[] ar = new File( ".").list();
			// File.list() returns null (rather than throwing) when the
			// directory cannot be read; report that explicitly.
			if ( ar == null)
				throw new IOException( "cannot list the current directory");
			for ( int ii = 0; ii < ar.length; ii++)
			{
				if ( ar[ ii].endsWith( "_index") &&
					 new File( ar[ ii]).isDirectory())
				{
					dedup( ar[ ii]);
				}
			}
		}
		System.exit( 0);
	}

	/**
	 * Prints to {@link #o} the URL of every document that repeats the URL of
	 * an earlier document in the index. The index is not modified.
	 *
	 * @param foo path of the index directory
	 * @throws Throwable whatever the index operations throw
	 */
	private static void showdups( String foo)
		throws Throwable
	{
		IndexReader r = IndexReader.open( foo);
		try
		{
			Set already = new HashSet();
			// Document numbers run up to maxDoc(); numDocs() only counts the
			// live documents, so using it as the bound would skip the tail of
			// any index that already contains deletions.
			int max = r.maxDoc();
			for ( int i = 0; i < max; i++)
			{
				if ( r.isDeleted( i)) continue;
				String u = r.document( i).get( DFields.URL);
				// A document without a URL field is not a duplicate of anything.
				if ( u == null) continue;
				// add() returns false when the URL was already in the set.
				if ( !already.add( u))
					o.println( u);
			}
		}
		finally
		{
			r.close();
		}
	}

	/**
	 * Deletes every document that repeats the URL of an earlier document in
	 * the index, then optimizes the index.
	 *
	 * <p>A report named <code>foo + ".txt"</code> is written containing the
	 * index path, its live document count and one line per deleted URL.
	 * Progress and the counts of duplicates and already-deleted ("stale")
	 * document slots go to {@link #o}.
	 *
	 * @param foo path of the index directory
	 * @throws Throwable whatever the index or file operations throw
	 */
	private static void dedup( String foo)
		throws Throwable
	{
		int nd = 0;		// duplicates deleted by this run
		int stale = 0;	// slots that were already deleted beforehand
		IndexReader r = IndexReader.open( foo);
		try
		{
			int num = r.numDocs();
			o.println( foo + ": " + num);
			PrintStream ps = new PrintStream( new FileOutputStream( foo + ".txt"));
			try
			{
				ps.println( "File: " + foo);
				ps.println( "Docs: " + nf.format( num));
				ps.println();

				Set already = new HashSet();
				// Iterate over every document slot, not just numDocs() of
				// them: deleted documents leave gaps in the numbering.
				int max = r.maxDoc();
				for ( int i = 0; i < max; i++)
				{
					if ( r.isDeleted( i))
					{
						stale++;
						continue;
					}
					Document d = r.document( i);
					String u = d.get( DFields.URL);
					// Never delete a document merely because it, like some
					// earlier one, has no URL field at all.
					if ( u == null) continue;
					// add() returns true for the first occurrence of a URL.
					if ( already.add( u)) continue;

					r.delete( i);
					nd++;
					ps.println( u);
					if ( DUMP_FIELDS)
						dumpFields( d, ps);
				}
			}
			finally
			{
				ps.close();
			}
			if ( nd > 0)
				o.println( "\t\tdup=" + nf.format( nd));
			if ( stale > 0)
				o.println( "\t\tstale=" + nf.format( stale));
		}
		finally
		{
			// The reader is closed before the writer below is opened, as in
			// the original ordering; per the Lucene IndexReader docs, close()
			// is what commits the deletions made through the reader.
			r.close();
		}

		o.println( "before opt");
		IndexWriter writer = new IndexWriter( foo, IndexBase.getAnalyzer(), false);
		try
		{
			writer.optimize();
			o.println( "after opt");
		}
		finally
		{
			writer.close();
		}
		o.println( "after close");
	}

	/**
	 * Debug helper: writes one line per field of the document to the given
	 * stream, followed by an empty line.
	 *
	 * @param d  document whose fields are listed
	 * @param ps stream receiving the listing
	 */
	private static void dumpFields( Document d, PrintStream ps)
	{
		Enumeration e = d.fields();
		while ( e.hasMoreElements())
		{
			Field z = (Field) e.nextElement();
			ps.println( "\t\t\tname=" + z.name() + " sv=" +
						"\"" + z.toString() + "\"" +
						"\"" + z.stringValue() + "\"" +
						" stored=" +
						z.isStored() + "/ indexed=" +
						z.isIndexed() + "/ tokenized=" +
						z.isTokenized());
		}
		ps.println();
	}


	/** Stream that receives the console output of this tool. */
	static PrintStream o = System.out;


}


> 
> Thanks,
> Kashif.
> 
> 
> 
> __________________________________________________
> Do You Yahoo!?
> Tired of spam?  Yahoo! Mail has the best spam protection around 
> http://mail.yahoo.com 
> 
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-dev-help@lucene.apache.org
> 


---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org