You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by Robert Muir <rc...@gmail.com> on 2009/11/22 16:22:49 UTC

Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Was this why I saw strange benchmark results?

On Sun, Nov 22, 2009 at 9:52 AM, <mi...@apache.org> wrote:

> Author: mikemccand
> Date: Sun Nov 22 14:52:02 2009
> New Revision: 883088
>
> URL: http://svn.apache.org/viewvc?rev=883088&view=rev
> Log:
> LUCENE-1458 (on flex branch): small optimization to terms dict cache: don't
> store redundant TermRef
>
> Modified:
>
>  lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java
>
>  lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
>
> Modified:
> lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java
> URL:
> http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java?rev=883088&r1=883087&r2=883088&view=diff
>
> ==============================================================================
> ---
> lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java
> (original)
> +++
> lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java
> Sun Nov 22 14:52:02 2009
> @@ -36,6 +36,8 @@
>     copy(text);
>   }
>
> +  // nocommit: we could do this w/ UnicodeUtil w/o requiring
> +  // allocation of new bytes[]?
>   public void copy(String text) {
>     try {
>       bytes = text.getBytes("UTF-8");
>
> Modified:
> lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
> URL:
> http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java?rev=883088&r1=883087&r2=883088&view=diff
>
> ==============================================================================
> ---
> lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
> (original)
> +++
> lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
> Sun Nov 22 14:52:02 2009
> @@ -304,6 +304,7 @@
>       public SeekStatus seek(TermRef term) throws IOException {
>         ReuseLRUCache<TermRef, CacheEntry> cache = null;
>         CacheEntry entry = null;
> +        TermRef entryKey = null;
>
>         if (docs.canCaptureState()) {
>           final ThreadResources resources = getThreadResources();
> @@ -312,7 +313,7 @@
>           entry = cache.get(term);
>           if (entry != null) {
>             docFreq = entry.freq;
> -            bytesReader.term.copy(entry.term);
> +            bytesReader.term.copy(term);
>             docs.setState(entry, docFreq);
>             termUpto = entry.termUpTo;
>             // nocommit -- would be better to do this lazy?
> @@ -384,16 +385,17 @@
>                 entry = cache.eldest;
>                 cache.eldest = null;
>                 docs.captureState(entry);
> -                entry.term.copy(bytesReader.term);
> +                entryKey = cache.eldestKey;
> +                entryKey.copy(bytesReader.term);
>               } else {
>                 entry = docs.captureState(null);
> -                entry.term = (TermRef) bytesReader.term.clone();
> +                entryKey = (TermRef) bytesReader.term.clone();
>               }
>               entry.freq = docFreq;
>               entry.termUpTo = termUpto;
>               entry.filePointer = in.getFilePointer();
>
> -              cache.put(entry.term, entry);
> +              cache.put(entryKey, entry);
>             }
>             return SeekStatus.FOUND;
>           } else if (cmp > 0) {
> @@ -517,9 +519,8 @@
>
>   // nocommit -- scrutinize API
>   public static class CacheEntry {
> -    int termUpTo;
> -    TermRef term; // nocommit -- really needed?
> -    long filePointer;
> +    int termUpTo;                                 // ord for this term
> +    long filePointer;                             // fp into the terms dict primary file (_X.tis)
>
>     // nocommit -- belongs in Pulsing's CacheEntry class:
>     public int freq;
> @@ -563,6 +564,7 @@
>     private final static float LOADFACTOR = 0.75f;
>     private int cacheSize;
>     V eldest;
> +    K eldestKey;
>
>     /**
>      * Creates a last-recently-used cache with the specified size.
> @@ -580,6 +582,7 @@
>       boolean remove = size() > ReuseLRUCache.this.cacheSize;
>       if (remove) {
>         this.eldest = eldest.getValue();
> +        this.eldestKey = eldest.getKey();
>       }
>       return remove;
>     }
>
>
>


-- 
Robert Muir
rcmuir@gmail.com

Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Mark Miller <ma...@gmail.com>.
bq. merging down every so often seems manageable so far (Mark?).

Yeah, this has been working great from my perspective.

Michael McCandless wrote:
> I think you should keep doing all LUCENE-1606 work (and, any other
> issues) on trunk, and then we merge down to flex branch once it's
> committed?
>
> We shouldn't hold up any trunk features because flex is
> coming... 
>
> I'm hoping to finish flex soonish -- largely what remains (I think!)
> is better testing (correctness & performance) of the 4-way
> combinations.  I think the codecs approach is generally working
> well.. the fact that we have initial Pulsing & PforDelta codecs
> working is great.
>
> Mike
>
> On Sun, Nov 22, 2009 at 1:11 PM, Robert Muir <rc...@gmail.com> wrote:
>   
>> Mike, I guess what I am implying is should i even bother with lucene-1606
>> and trunk?
>>
>> or instead, should i be helping you, looking at TermsEnum, and working on
>> integrating it into flex?
>>
>> On Sun, Nov 22, 2009 at 1:05 PM, Michael McCandless
>> <lu...@mikemccandless.com> wrote:
>>     
>>> On Sun, Nov 22, 2009 at 11:31 AM, Robert Muir <rc...@gmail.com> wrote:
>>>
>>>       
>>>>> No, not really... just an optimization I found when hunting ;)
>>>>>
>>>>> I'm working now on an AutomatonTermsEnum that uses the flex API
>>>>> directly, to test that performance.
>>>>>
>>>>>           
>>>> I didn't mean to 'bail out' on this
>>>>         
>>> You didn't 'bail out'; I 'bailed in' ;)  This is the joy of open
>>> source... great big noisy Bazaar.
>>>
>>>       
>>>> but I could not tell if TermsEnum was close to stabilized
>>>>         
>>> I think it's close; we need to do this port anyway, once automaton is
>>> committed to trunk, so really I saved Mark some work ;)
>>>
>>>       
>>>> and it might be significant work to convert it?
>>>>         
>>> It wasn't too bad, but maybe you can look it over once I post patch
>>> and see if I messed anything up :)
>>>
>>>       
>>>> Maybe benching numeric range would be easier and accomplish the same
>>>> thing?
>>>>         
>>> Yeah benching NRQ would be good too... many benchmarks still to run.
>>>
>>> Mike
>>>
>>> ---------------------------------------------------------------------
>>> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
>>> For additional commands, e-mail: java-dev-help@lucene.apache.org
>>>
>>>       
>>
>> --
>> Robert Muir
>> rcmuir@gmail.com
>>
>>     
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-dev-help@lucene.apache.org
>
>   


---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org


Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Michael McCandless <lu...@mikemccandless.com>.
On Sun, Nov 22, 2009 at 4:19 PM, Robert Muir <rc...@gmail.com> wrote:

>> What places specifically are you worried about?
>
> places like AutomatonQuery, where I found myself wanting to consider the
> option of processing byte[], when I know this is very bad!

Ahh OK :)  Well you got the better of yourself before it was too late!

Mike

---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org


Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Robert Muir <rc...@gmail.com>.
On Sun, Nov 22, 2009 at 4:16 PM, Michael McCandless <
lucene@mikemccandless.com> wrote:

> On Sun, Nov 22, 2009 at 4:06 PM, Robert Muir <rc...@gmail.com> wrote:
> > I guess here is where I just say that unicode and java are optimized for
> > utf-16 processing
>
> I agree, though leaving things as UTF8 works fine for low level stuff
> (sorting, comparing equality, etc.)?
>

+1


>
> > and so while I agree with byte[] being available in
> > places like this for flex indexing,
> > I'm already nervous about seeing code / optimizations that only work well
> > with latin-1, and are very slow / buggy for anything else.
>
> Buggy we should clearly outright fix.
>
> Slower, maybe.  But very slow, I hope not?
>
> What places specifically are you worried about?
>

places like AutomatonQuery, where I found myself wanting to consider the
option of processing byte[], when I know this is very bad!


> Mike
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-dev-help@lucene.apache.org
>
>


-- 
Robert Muir
rcmuir@gmail.com

Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Michael McCandless <lu...@mikemccandless.com>.
On Sun, Nov 22, 2009 at 4:06 PM, Robert Muir <rc...@gmail.com> wrote:
> I guess here is where I just say that unicode and java are optimized for
> utf-16 processing

I agree, though leaving things as UTF8 works fine for low level stuff
(sorting, comparing equality, etc.)?

> and so while I agree with byte[] being available in
> places like this for flex indexing,
> I'm already nervous about seeing code / optimizations that only work well
> with latin-1, and are very slow / buggy for anything else.

Buggy we should clearly outright fix.

Slower, maybe.  But very slow, I hope not?

What places specifically are you worried about?

Mike

---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org


Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Robert Muir <rc...@gmail.com>.
I guess here is where I just say that Unicode and Java are optimized for
UTF-16 processing, and so while I agree with byte[] being available in
places like this for flex indexing,
I'm already nervous about seeing code / optimizations that only work well
with Latin-1, and are very slow / buggy for anything else.

On Sun, Nov 22, 2009 at 3:58 PM, Michael McCandless <
lucene@mikemccandless.com> wrote:

> On Sun, Nov 22, 2009 at 3:52 PM, Robert Muir <rc...@gmail.com> wrote:
> >
> > On Sun, Nov 22, 2009 at 3:50 PM, Michael McCandless
> > <lu...@mikemccandless.com> wrote:
> >>
> >> Yeah I think there will be lots of optimizing we can do, after flex
> lands.
> >>
> >> Maybe stick w/ String for now?  But open an issue, today, to remind us
> >> to cutover to char[] post-flex?
> >
> > ok, i'll create one.
>
> Thanks.
>
> >> Doing all processing in UTF8 is tantalizing too ;)  This would mean no
> >> conversion of the terms data on iterating from the terms dict...
> >
> > lets please not go this route :) its gonna be enough trouble fixing the
> > char[]-based code for unicode 4, forget about byte[]
>
> I'll defer to you ;)
>
> Mike
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-dev-help@lucene.apache.org
>
>


-- 
Robert Muir
rcmuir@gmail.com

Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Michael McCandless <lu...@mikemccandless.com>.
On Sun, Nov 22, 2009 at 3:52 PM, Robert Muir <rc...@gmail.com> wrote:
>
> On Sun, Nov 22, 2009 at 3:50 PM, Michael McCandless
> <lu...@mikemccandless.com> wrote:
>>
>> Yeah I think there will be lots of optimizing we can do, after flex lands.
>>
>> Maybe stick w/ String for now?  But open an issue, today, to remind us
>> to cutover to char[] post-flex?
>
> ok, i'll create one.

Thanks.

>> Doing all processing in UTF8 is tantalizing too ;)  This would mean no
>> conversion of the terms data on iterating from the terms dict...
>
> lets please not go this route :) its gonna be enough trouble fixing the
> char[]-based code for unicode 4, forget about byte[]

I'll defer to you ;)

Mike

---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org


Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Robert Muir <rc...@gmail.com>.
On Sun, Nov 22, 2009 at 3:50 PM, Michael McCandless <
lucene@mikemccandless.com> wrote:

> Yeah I think there will be lots of optimizing we can do, after flex lands.
>
> Maybe stick w/ String for now?  But open an issue, today, to remind us
> to cutover to char[] post-flex?
>

ok, i'll create one.


>
> Doing all processing in UTF8 is tantalizing too ;)  This would mean no
> conversion of the terms data on iterating from the terms dict...
>

Let's please not go this route :) It's going to be enough trouble fixing the
char[]-based code for Unicode 4; forget about byte[].

>
> Mike
>
>
-- 
Robert Muir
rcmuir@gmail.com

Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Michael McCandless <lu...@mikemccandless.com>.
Yeah I think there will be lots of optimizing we can do, after flex lands.

Maybe stick w/ String for now?  But open an issue, today, to remind us
to cutover to char[] post-flex?

Doing all processing in UTF8 is tantalizing too ;)  This would mean no
conversion of the terms data on iterating from the terms dict...

Mike

On Sun, Nov 22, 2009 at 1:56 PM, Robert Muir <rc...@gmail.com> wrote:
> ok, I only ask because some rework of this enum could be necessary to take
> advantage of the new api.
>
> examples include changing it to use char[] (easy) to prevent lots of string
> creation, which was unavoidable with TermEnum since it is based on string.
>
> i will never mention this again, but it could also run on byte[] pretty
> easily.
> However I think high-level processing like this should use utf-16
> processing, as java intended, although I'm pretty positive it would be
> extremely fast.
>
> On Sun, Nov 22, 2009 at 1:33 PM, Michael McCandless
> <lu...@mikemccandless.com> wrote:
>>
>> I think you should keep doing all LUCENE-1606 work (and, any other
>> issues) on trunk, and then we merge down to flex branch once it's
>> committed?
>>
>> We shouldn't hold up any trunk features because flex is
>> coming... merging down every so often seems manageable so far (Mark?).
>>
>> I'm hoping to finish flex soonish -- largely what remains (I think!)
>> is better testing (correctness & performance) of the 4-way
>> combinations.  I think the codecs approach is generally working
>> well.. the fact that we have initial Pulsing & PforDelta codecs
>> working is great.
>>
>> Mike
>>
>> On Sun, Nov 22, 2009 at 1:11 PM, Robert Muir <rc...@gmail.com> wrote:
>> > Mike, I guess what I am implying is should i even bother with
>> > lucene-1606
>> > and trunk?
>> >
>> > or instead, should i be helping you, looking at TermsEnum, and working
>> > on
>> > integrating it into flex?
>> >
>> > On Sun, Nov 22, 2009 at 1:05 PM, Michael McCandless
>> > <lu...@mikemccandless.com> wrote:
>> >>
>> >> On Sun, Nov 22, 2009 at 11:31 AM, Robert Muir <rc...@gmail.com> wrote:
>> >>
>> >> >> No, not really... just an optimization I found when hunting ;)
>> >> >>
>> >> >> I'm working now on an AutomatonTermsEnum that uses the flex API
>> >> >> directly, to test that performance.
>> >> >>
>> >> >
>> >> > I didn't mean to 'bail out' on this
>> >>
>> >> You didn't 'bail out'; I 'bailed in' ;)  This is the joy of open
>> >> source... great big noisy Bazaar.
>> >>
>> >> > but I could not tell if TermsEnum was close to stabilized
>> >>
>> >> I think it's close; we need to do this port anyway, once automaton is
>> >> committed to trunk, so really I saved Mark some work ;)
>> >>
>> >> > and it might be significant work to convert it?
>> >>
>> >> It wasn't too bad, but maybe you can look it over once I post patch
>> >> and see if I messed anything up :)
>> >>
>> >> > Maybe benching numeric range would be easier and accomplish the same
>> >> > thing?
>> >>
>> >> Yeah benching NRQ would be good too... many benchmarks still to run.
>> >>
>> >> Mike
>> >>
>> >> ---------------------------------------------------------------------
>> >> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
>> >> For additional commands, e-mail: java-dev-help@lucene.apache.org
>> >>
>> >
>> >
>> >
>> > --
>> > Robert Muir
>> > rcmuir@gmail.com
>> >
>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
>> For additional commands, e-mail: java-dev-help@lucene.apache.org
>>
>
>
>
> --
> Robert Muir
> rcmuir@gmail.com
>

---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org


Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Robert Muir <rc...@gmail.com>.
OK, I only ask because some rework of this enum could be necessary to take
advantage of the new API.

Examples include changing it to use char[] (easy) to prevent lots of String
creation, which was unavoidable with TermEnum since it is based on String.

I will never mention this again, but it could also run on byte[] pretty
easily.
However, I think high-level processing like this should use UTF-16
processing, as Java intended, even though I'm pretty positive the byte[]
version would be extremely fast.

On Sun, Nov 22, 2009 at 1:33 PM, Michael McCandless <
lucene@mikemccandless.com> wrote:

> I think you should keep doing all LUCENE-1606 work (and, any other
> issues) on trunk, and then we merge down to flex branch once it's
> committed?
>
> We shouldn't hold up any trunk features because flex is
> coming... merging down every so often seems manageable so far (Mark?).
>
> I'm hoping to finish flex soonish -- largely what remains (I think!)
> is better testing (correctness & performance) of the 4-way
> combinations.  I think the codecs approach is generally working
> well.. the fact that we have initial Pulsing & PforDelta codecs
> working is great.
>
> Mike
>
> On Sun, Nov 22, 2009 at 1:11 PM, Robert Muir <rc...@gmail.com> wrote:
> > Mike, I guess what I am implying is should i even bother with lucene-1606
> > and trunk?
> >
> > or instead, should i be helping you, looking at TermsEnum, and working on
> > integrating it into flex?
> >
> > On Sun, Nov 22, 2009 at 1:05 PM, Michael McCandless
> > <lu...@mikemccandless.com> wrote:
> >>
> >> On Sun, Nov 22, 2009 at 11:31 AM, Robert Muir <rc...@gmail.com> wrote:
> >>
> >> >> No, not really... just an optimization I found when hunting ;)
> >> >>
> >> >> I'm working now on an AutomatonTermsEnum that uses the flex API
> >> >> directly, to test that performance.
> >> >>
> >> >
> >> > I didn't mean to 'bail out' on this
> >>
> >> You didn't 'bail out'; I 'bailed in' ;)  This is the joy of open
> >> source... great big noisy Bazaar.
> >>
> >> > but I could not tell if TermsEnum was close to stabilized
> >>
> >> I think it's close; we need to do this port anyway, once automaton is
> >> committed to trunk, so really I saved Mark some work ;)
> >>
> >> > and it might be significant work to convert it?
> >>
> >> It wasn't too bad, but maybe you can look it over once I post patch
> >> and see if I messed anything up :)
> >>
> >> > Maybe benching numeric range would be easier and accomplish the same
> >> > thing?
> >>
> >> Yeah benching NRQ would be good too... many benchmarks still to run.
> >>
> >> Mike
> >>
> >> ---------------------------------------------------------------------
> >> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
> >> For additional commands, e-mail: java-dev-help@lucene.apache.org
> >>
> >
> >
> >
> > --
> > Robert Muir
> > rcmuir@gmail.com
> >
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-dev-help@lucene.apache.org
>
>


-- 
Robert Muir
rcmuir@gmail.com

Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Michael McCandless <lu...@mikemccandless.com>.
I think you should keep doing all LUCENE-1606 work (and, any other
issues) on trunk, and then we merge down to flex branch once it's
committed?

We shouldn't hold up any trunk features because flex is
coming... merging down every so often seems manageable so far (Mark?).

I'm hoping to finish flex soonish -- largely what remains (I think!)
is better testing (correctness & performance) of the 4-way
combinations.  I think the codecs approach is generally working
well.. the fact that we have initial Pulsing & PforDelta codecs
working is great.

Mike

On Sun, Nov 22, 2009 at 1:11 PM, Robert Muir <rc...@gmail.com> wrote:
> Mike, I guess what I am implying is should i even bother with lucene-1606
> and trunk?
>
> or instead, should i be helping you, looking at TermsEnum, and working on
> integrating it into flex?
>
> On Sun, Nov 22, 2009 at 1:05 PM, Michael McCandless
> <lu...@mikemccandless.com> wrote:
>>
>> On Sun, Nov 22, 2009 at 11:31 AM, Robert Muir <rc...@gmail.com> wrote:
>>
>> >> No, not really... just an optimization I found when hunting ;)
>> >>
>> >> I'm working now on an AutomatonTermsEnum that uses the flex API
>> >> directly, to test that performance.
>> >>
>> >
>> > I didn't mean to 'bail out' on this
>>
>> You didn't 'bail out'; I 'bailed in' ;)  This is the joy of open
>> source... great big noisy Bazaar.
>>
>> > but I could not tell if TermsEnum was close to stabilized
>>
>> I think it's close; we need to do this port anyway, once automaton is
>> committed to trunk, so really I saved Mark some work ;)
>>
>> > and it might be significant work to convert it?
>>
>> It wasn't too bad, but maybe you can look it over once I post patch
>> and see if I messed anything up :)
>>
>> > Maybe benching numeric range would be easier and accomplish the same
>> > thing?
>>
>> Yeah benching NRQ would be good too... many benchmarks still to run.
>>
>> Mike
>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
>> For additional commands, e-mail: java-dev-help@lucene.apache.org
>>
>
>
>
> --
> Robert Muir
> rcmuir@gmail.com
>

---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org


Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Robert Muir <rc...@gmail.com>.
Mike, I guess what I am asking is: should I even bother with LUCENE-1606
and trunk?

Or instead, should I be helping you, looking at TermsEnum, and working on
integrating it into flex?

On Sun, Nov 22, 2009 at 1:05 PM, Michael McCandless <
lucene@mikemccandless.com> wrote:

> On Sun, Nov 22, 2009 at 11:31 AM, Robert Muir <rc...@gmail.com> wrote:
>
> >> No, not really... just an optimization I found when hunting ;)
> >>
> >> I'm working now on an AutomatonTermsEnum that uses the flex API
> >> directly, to test that performance.
> >>
> >
> > I didn't mean to 'bail out' on this
>
> You didn't 'bail out'; I 'bailed in' ;)  This is the joy of open
> source... great big noisy Bazaar.
>
> > but I could not tell if TermsEnum was close to stabilized
>
> I think it's close; we need to do this port anyway, once automaton is
> committed to trunk, so really I saved Mark some work ;)
>
> > and it might be significant work to convert it?
>
> It wasn't too bad, but maybe you can look it over once I post patch
> and see if I messed anything up :)
>
> > Maybe benching numeric range would be easier and accomplish the same
> thing?
>
> Yeah benching NRQ would be good too... many benchmarks still to run.
>
> Mike
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-dev-help@lucene.apache.org
>
>


-- 
Robert Muir
rcmuir@gmail.com

Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Michael McCandless <lu...@mikemccandless.com>.
On Sun, Nov 22, 2009 at 11:31 AM, Robert Muir <rc...@gmail.com> wrote:

>> No, not really... just an optimization I found when hunting ;)
>>
>> I'm working now on an AutomatonTermsEnum that uses the flex API
>> directly, to test that performance.
>>
>
> I didn't mean to 'bail out' on this

You didn't 'bail out'; I 'bailed in' ;)  This is the joy of open
source... great big noisy Bazaar.

> but I could not tell if TermsEnum was close to stabilized

I think it's close; we need to do this port anyway, once automaton is
committed to trunk, so really I saved Mark some work ;)

> and it might be significant work to convert it?

It wasn't too bad, but maybe you can look it over once I post patch
and see if I messed anything up :)

> Maybe benching numeric range would be easier and accomplish the same thing?

Yeah benching NRQ would be good too... many benchmarks still to run.

Mike

---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org


Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Robert Muir <rc...@gmail.com>.
On Sun, Nov 22, 2009 at 11:23 AM, Michael McCandless <
lucene@mikemccandless.com> wrote:

> No, not really... just an optimization I found when hunting ;)
>
> I'm working now on an AutomatonTermsEnum that uses the flex API
> directly, to test that performance.
>
>
I didn't mean to 'bail out' on this but I could not tell if TermsEnum was
close to stabilized, and it might be significant work to convert it?
Maybe benching numeric range would be easier and accomplish the same thing?

-- 
Robert Muir
rcmuir@gmail.com

Re: svn commit: r883088 - in /lucene/java/branches/flex_1458/src/java/org/apache/lucene/index: TermRef.java codecs/standard/StandardTermsDictReader.java

Posted by Michael McCandless <lu...@mikemccandless.com>.
No, not really... just an optimization I found when hunting ;)

I'm working now on an AutomatonTermsEnum that uses the flex API
directly, to test that performance.

One of the major challenges with flex is the 4-way testing required.
Ie, you can have a non-flex or flex index, and then you can access it
via non-flex or flex API.  All 4 are allowed, and must work (for
back-compat).

I'm most concerned about performance of flex API on top of flex index,
since that's the future, but not hurting performance of the other 3 is
also important.

Mike

On Sun, Nov 22, 2009 at 10:22 AM, Robert Muir <rc...@gmail.com> wrote:
> was this why i saw strange benchmark results?
>
> On Sun, Nov 22, 2009 at 9:52 AM, <mi...@apache.org> wrote:
>>
>> Author: mikemccand
>> Date: Sun Nov 22 14:52:02 2009
>> New Revision: 883088
>>
>> URL: http://svn.apache.org/viewvc?rev=883088&view=rev
>> Log:
>> LUCENE-1458 (on flex branch): small optimization to terms dict cache:
>> don't store redundant TermRef
>>
>> Modified:
>>
>>  lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java
>>
>>  lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
>>
>> Modified:
>> lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java
>> URL:
>> http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java?rev=883088&r1=883087&r2=883088&view=diff
>>
>> ==============================================================================
>> ---
>> lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java
>> (original)
>> +++
>> lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java
>> Sun Nov 22 14:52:02 2009
>> @@ -36,6 +36,8 @@
>>     copy(text);
>>   }
>>
>> +  // nocommit: we could do this w/ UnicodeUtil w/o requiring
>> +  // allocation of new bytes[]?
>>   public void copy(String text) {
>>     try {
>>       bytes = text.getBytes("UTF-8");
>>
>> Modified:
>> lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
>> URL:
>> http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java?rev=883088&r1=883087&r2=883088&view=diff
>>
>> ==============================================================================
>> ---
>> lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
>> (original)
>> +++
>> lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
>> Sun Nov 22 14:52:02 2009
>> @@ -304,6 +304,7 @@
>>       public SeekStatus seek(TermRef term) throws IOException {
>>         ReuseLRUCache<TermRef, CacheEntry> cache = null;
>>         CacheEntry entry = null;
>> +        TermRef entryKey = null;
>>
>>         if (docs.canCaptureState()) {
>>           final ThreadResources resources = getThreadResources();
>> @@ -312,7 +313,7 @@
>>           entry = cache.get(term);
>>           if (entry != null) {
>>             docFreq = entry.freq;
>> -            bytesReader.term.copy(entry.term);
>> +            bytesReader.term.copy(term);
>>             docs.setState(entry, docFreq);
>>             termUpto = entry.termUpTo;
>>             // nocommit -- would be better to do this lazy?
>> @@ -384,16 +385,17 @@
>>                 entry = cache.eldest;
>>                 cache.eldest = null;
>>                 docs.captureState(entry);
>> -                entry.term.copy(bytesReader.term);
>> +                entryKey = cache.eldestKey;
>> +                entryKey.copy(bytesReader.term);
>>               } else {
>>                 entry = docs.captureState(null);
>> -                entry.term = (TermRef) bytesReader.term.clone();
>> +                entryKey = (TermRef) bytesReader.term.clone();
>>               }
>>               entry.freq = docFreq;
>>               entry.termUpTo = termUpto;
>>               entry.filePointer = in.getFilePointer();
>>
>> -              cache.put(entry.term, entry);
>> +              cache.put(entryKey, entry);
>>             }
>>             return SeekStatus.FOUND;
>>           } else if (cmp > 0) {
>> @@ -517,9 +519,8 @@
>>
>>   // nocommit -- scrutinize API
>>   public static class CacheEntry {
>> -    int termUpTo;
>> -    TermRef term; // nocommit -- really needed?
>> -    long filePointer;
>> +    int termUpTo;                                 // ord for this term
>> +    long filePointer;                             // fp into the terms dict primary file (_X.tis)
>>
>>     // nocommit -- belongs in Pulsing's CacheEntry class:
>>     public int freq;
>> @@ -563,6 +564,7 @@
>>     private final static float LOADFACTOR = 0.75f;
>>     private int cacheSize;
>>     V eldest;
>> +    K eldestKey;
>>
>>     /**
>>      * Creates a last-recently-used cache with the specified size.
>> @@ -580,6 +582,7 @@
>>       boolean remove = size() > ReuseLRUCache.this.cacheSize;
>>       if (remove) {
>>         this.eldest = eldest.getValue();
>> +        this.eldestKey = eldest.getKey();
>>       }
>>       return remove;
>>     }
>>
>>
>
>
>
> --
> Robert Muir
> rcmuir@gmail.com
>

---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org