You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by Kasun Perera <ka...@opensource.lk> on 2012/06/18 05:18:06 UTC

Calculating Average Document Length with Lucene

I want to calculate the average document length for a document collection in
which each document has 3 different fields (filed1, filed2, filed3).

This is the code that calculates the average length when there is only one
field.

// Byte-encoded normalization factors of field "filed1", as returned by norms().
private byte[] normsDocLengthArr = null;
// Average document length computed from those factors.
private double avgDocLength;

// norms - returns the byte-encoded normalization factor for the named field
// of every document.
normsDocLengthArr = indexReader.norms("filed1");

double sumLength = 0;

for (int i = 0; i < normsDocLengthArr.length; i++) {

    // decodeNorm - decodes a normalization factor stored in an index.
    double encodeLength = DefaultSimilarity.decodeNorm(normsDocLengthArr[i]);

    // Recover the length as 1 / norm^2.
    // NOTE(review): presumably relies on the default length norm
    // (1 / sqrt(numTerms)) with no index-time boosts - confirm.
    double length = 1 / (encodeLength * encodeLength);

    sumLength += length;

}

// Mean over all entries of the norms array.
this.avgDocLength = sumLength / normsDocLengthArr.length;

This is how I extended it for all 3 fields.

// Byte-encoded normalization factors, one array per field.
private byte[] normsDocLengthArrField1 = null;
private byte[] normsDocLengthArrField2 = null;
private byte[] normsDocLengthArrField3 = null;
private double avgDocLength;

// norms - returns the byte-encoded normalization factor for the named field
// of every document.
normsDocLengthArrField1 = indexReader.norms("filed1");

normsDocLengthArrField2 = indexReader.norms("filed2");

normsDocLengthArrField3 = indexReader.norms("filed3");

double sumLength = 0;

for (int i = 0; i < normsDocLengthArrField1.length; i++) {

    // decodeNorm - decodes a normalization factor stored in an index.
    double encodeLengthF1 = DefaultSimilarity.decodeNorm(normsDocLengthArrField1[i]);

    double encodeLengthF2 = DefaultSimilarity.decodeNorm(normsDocLengthArrField2[i]);

    double encodeLengthF3 = DefaultSimilarity.decodeNorm(normsDocLengthArrField3[i]);

    // NOTE(review): this is the reciprocal of the summed squared norms, which
    // is not the same as summing the three per-field lengths
    // (1 / norm^2 each); the follow-up message in this thread sums the
    // per-field lengths instead.
    double length = 1 / ((encodeLengthF1 * encodeLengthF1)
            + (encodeLengthF2 * encodeLengthF2)
            + (encodeLengthF3 * encodeLengthF3));

    sumLength += length;

}

// NOTE(review): the divisor adds up all three array lengths, while the loop
// above iterates over the first array only; the follow-up message divides by
// the first array's length alone.
this.avgDocLength = sumLength / (normsDocLengthArrField1.length
        + normsDocLengthArrField2.length
        + normsDocLengthArrField3.length);

I just want to know whether my implementation for calculating the average
document length over 3 fields is correct.

-- 
Regards

Kasun Perera

Re: Calculating Average Document Length with Lucene

Posted by Kasun Perera <ka...@opensource.lk>.
I found that this is the correct way of calculating the average document
length for documents having three fields:

// Byte-encoded normalization factors of the three fields; the loop below
// reads the same index from each array.
byte[] normsOne = indexReader.norms("filed1");
byte[] normsTwo = indexReader.norms("filed2");
byte[] normsThree = indexReader.norms("filed3");

// The first field's array determines how many entries are averaged.
final int docCount = normsOne.length;

double sumLength = 0;
for (int doc = 0; doc < docCount; doc++) {
    // decodeNorm - decodes a normalization factor stored in an index.
    double normOne = DefaultSimilarity.decodeNorm(normsOne[doc]);
    double normTwo = DefaultSimilarity.decodeNorm(normsTwo[doc]);
    double normThree = DefaultSimilarity.decodeNorm(normsThree[doc]);

    // Each field contributes 1 / norm^2; the three contributions are added
    // together before being accumulated into the running total.
    sumLength += 1 / (normOne * normOne)
            + 1 / (normTwo * normTwo)
            + 1 / (normThree * normThree);
}

// Average of the summed per-entry lengths over the first field's entry count.
this.avgDocLength = sumLength / docCount;

Thanks

On Mon, Jun 18, 2012 at 8:48 AM, Kasun Perera <ka...@opensource.lk> wrote:

>  I want to calculate average document length for document collection which
> each document having 3 different fields(filed1, field2,field3)
>
> This is the program to calculate average length when only one field is
> there.
>
> private byte[] normsDocLengthArr = null;
>
> private double avgDocLength;
>
>
> normsDocLengthArr = indexReader.norms("filed1");
>
>
>             //norms-Returns the byte-encoded normalization factor for the named field of every document.
>
> double sumLength = 0;
>
>
> for (int i = 0; i < normsDocLengthArr.length; i++) {
>
>
>     double encodeLength = DefaultSimilarity.decodeNorm(normsDocLengthArr[i]);
>
>
>     //decodeNorm -Decodes a normalization factor stored in an index.
>
>
>     double length = 1 / (encodeLength * encodeLength);
>
>
>     sumLength += length;
>
>
> }
>
>
> this.avgDocLength = sumLength / normsDocLengthArr.length;
>
>  This is how I extended it for all 3 fields.
>
> private byte[] normsDocLengthArrField1 = null;
>
> private byte[] normsDocLengthArrField2 = null;
>
> private byte[] normsDocLengthArrField3 = null;
>
> private double avgDocLength;
>
>
> normsDocLengthArrField1 = indexReader.norms("filed1");
>
>
> normsDocLengthArrField2 = indexReader.norms("filed2");
>
>
> normsDocLengthArrField3 = indexReader.norms("filed3");
>
>
>             //norms-Returns the byte-encoded normalization factor for the named field of every document.
>
> double sumLength = 0;
>
>
> for (int i = 0; i < normsDocLengthArrField1.length; i++) {
>
>
>     double encodeLengthF1 = DefaultSimilarity.decodeNorm(normsDocLengthArrField1[i]);
>
>
>     double encodeLengthF2 = DefaultSimilarity.decodeNorm(normsDocLengthArrField2[i]);
>
>
>     double encodeLengthF3 = DefaultSimilarity.decodeNorm(normsDocLengthArrField3[i]);
>
>
>     //decodeNorm -Decodes a normalization factor stored in an index.
>
>
>     double length = 1 / {(encodeLengthF1 * encodeLengthF1)+(encodeLengthF2 * encodeLengthF2)+(encodeLengthF3 * encodeLengthF3)};
>
>
>     sumLength += length;
>
>
> }
>
>
> this.avgDocLength = sumLength / (normsDocLengthArrField1.length+ normsDocLengthArrField2.length+normsDocLengthArrField3.length;
>
>  I just want to know whether my implementation of calculating Doc average
> length for 3 field is correct?
>
> --
> Regards
>
> Kasun Perera
>
>


-- 
Regards

Kasun Perera