You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2010/02/12 11:16:33 UTC

svn commit: r909334 - in /lucene/java/branches/lucene_2_9: ./ docs/ src/java/org/apache/lucene/index/ src/site/src/documentation/content/xdocs/

Author: mikemccand
Date: Fri Feb 12 10:16:33 2010
New Revision: 909334

URL: http://svn.apache.org/viewvc?rev=909334&view=rev
Log:
LUCENE-2257: improve max per-segment term count limit from ~2.1B to ~274B

Modified:
    lucene/java/branches/lucene_2_9/CHANGES.txt
    lucene/java/branches/lucene_2_9/docs/fileformats.html
    lucene/java/branches/lucene_2_9/docs/fileformats.pdf
    lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/SegmentTermEnum.java
    lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/TermInfosReader.java
    lucene/java/branches/lucene_2_9/src/site/src/documentation/content/xdocs/fileformats.xml

Modified: lucene/java/branches/lucene_2_9/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/CHANGES.txt?rev=909334&r1=909333&r2=909334&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/CHANGES.txt (original)
+++ lucene/java/branches/lucene_2_9/CHANGES.txt Fri Feb 12 10:16:33 2010
@@ -27,6 +27,10 @@
  * LUCENE-2158: At high indexing rates, NRT reader could temporarily
    lose deletions.  (Mike McCandless)
   
+ * LUCENE-2257: Increase max number of unique terms in one segment to
+   termIndexInterval (default 128) * ~2.1 billion = ~274 billion.
+   (Tom Burton-West via Mike McCandless)
+  
 API Changes
 
  * LUCENE-2182: DEFAULT_ATTRIBUTE_FACTORY was failing to load

Modified: lucene/java/branches/lucene_2_9/docs/fileformats.html
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/docs/fileformats.html?rev=909334&r1=909333&r2=909334&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/docs/fileformats.html (original)
+++ lucene/java/branches/lucene_2_9/docs/fileformats.html Fri Feb 12 10:16:33 2010
@@ -2547,11 +2547,12 @@
 <div class="section">
 <p>
 	      When referring to term numbers, Lucene's current
-	      implementation uses a Java <span class="codefrag">int</span>, which means
-	      the maximum number of unique terms in any single index
-	      segment is 2,147,483,648.  This is technically not a
-	      limitation of the index file format, just of Lucene's
-	      current implementation.
+	      implementation uses a Java <span class="codefrag">int</span> to hold the
+	      term index, which means the maximum number of unique
+	      terms in any single index segment is ~2.1 billion times
+	      the term index interval (default 128) = ~274 billion.
+	      This is technically not a limitation of the index file
+	      format, just of Lucene's current implementation.
 	    </p>
 <p>
 	      Similarly, Lucene uses a Java <span class="codefrag">int</span> to refer

Modified: lucene/java/branches/lucene_2_9/docs/fileformats.pdf
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/docs/fileformats.pdf?rev=909334&r1=909333&r2=909334&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/docs/fileformats.pdf (original)
+++ lucene/java/branches/lucene_2_9/docs/fileformats.pdf Fri Feb 12 10:16:33 2010
@@ -667,10 +667,10 @@
 >>
 endobj
 111 0 obj
-<< /Length 1801 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 1845 /Filter [ /ASCII85Decode /FlateDecode ]
  >>
 stream
-Gatm<>E@c%'Ro4HjK13fG\lPL<L(^M)=;1H]aSa"3di^?5e;fsk/+q5qDnr@j5@/#>)aB'A"VfQS_N=$kG,NXdH;*]f';idMD;tC&-V:5k;U.'/tp@-$LTH7^Q,1p?N)YM*ArPAGr!8(0!,&.e*ZcV$F&^;F6Ba=Bj-erc@iWHmO",>o\fW2U#KJ8%,]-0<;4\qcMW)dW(W4r')L'%gC?hm*f`HWnV%dbVZG#-_jUj#'P\O_oZ'&jIQr]%54&Rn#K1_F/7O2B740fG=lc[4Ro3YmceBq7Jn539+CM0NV+[Zoel+/JaFIhGpmP4I%\.R[ZC5-ce+e*fLJ4fAT\@3Qg<OUkg%TUHnGP,C=+#_'.pZ4H",NEsLTNJM(fBH(?KeXfo^Bc8fTuro@"@"D!KU38XjWeP#<ha-pVbZZknj=14l:`^lpdt5U*+CR:Rd:o1k!7K+CnUk"W%n9HHE4gBNuJ8`C+?h=TFac<%Jbl7LF%';\'KIXPn!+dA)t3eiE2m_tS#TS7_cUE%q?XTk5a=alQPQGf:S;O8pJoArGIg8D1^=@"9Xra$]:!Qj.`rXSB_"RPF;J`A:g<&58dgZ<6Y60G+Xe$CU3W``ZHtZ,gKj2a.5`>Gc1u-7W7h"cbCre?E8TM&At<U83)97qa[m#f:qI#Ki-S6k-)'9O'n)hi>8O!?\Zsm0q0)\(>&=We,.h1o`pZE7qOk1rFOCj3(%1n;J"B0i&M[BePZMTY8(,j_P+]_P5I?\J/(K8Z6DjQ_^H`SEO/<c%UDPoC1LgWl98*ZQ1]Y35)SuSu#ZX`icHJ?a0aBM$o43HuJb;OMG;$MM6UhF!JK?Ak#AJZ[A84JW*RL^jn2O7"f!cRE`_?KbQ6?b_61\#p$X"5_iJeHa38glO9^pJ:2R^=5R/3#XE]"\+C,=$RjI@3i)9d>Y">"_S/9Z@,81;O;nec;K.Sh35ViII!^N'h`d&Fa!oTbnAF-)SW:(h*^RW!ZkFM
 nf4=9B6Til(FqYcoO?tniTMb>_+7;LbV\NtQAdON4g_="n/dHr.9:I`img>pmXGCoL.5dT;_%U,hKC-2."mArlK#2QtTjDcU^OZijUGX1BS&fM_IqtMc4YW2pchD7rNX.bkiX:CO+B#XE_l!(M&o8,(lC&:VFoWIf2JGO'0#O'r5)PBdF3_]8*R*2#c0Gb-m"MMVl"<Q\)h\G'())cK.N`6a-%YMn%^1Ye$lp!7*.*f0%_Tkd\nG$RNi()\E3I*dq/R_VI+tT[^l@#-T'\c[&GE1s;J9gn?>4r1ri%&PJjQZZUW)F>7c#SGN$u)DUp-W7j\O%/N"t%)+rC6*J5b&i/rg?gf)qQ>0oC<>?l$4CDGTap@Wn1#_`e#([onhm(7l/DLQ].K%&"i%@qog]SY54_K@#]&\CdSEi2a6_A-\ptml007OI2_pGssbFA@6[laqdrV2`R)'V"YX4fS*\3f>,V>4#L01afe)u\m0--fn,sQ)-]FFs,o^4H.qb\/+2kb`6l+lrN;Y4Gh8JS#)F1%g,CT;[NS:Jn>_(cEbKaGProLPjEZK-n>=?sW2b%F]e/'&0&"]e\7%IOAG78TS[D<B6J_lP+`S*fEMr*BXh*5qda[qM%Kd&rep2">N8C+WmlWBfB"=LHK?[dUc'ujV5J\KMr?G`JG;6k+/'3XN&F5`o:W;koaM4seWJhHt0u4mEMOr5Bo>0`iiqk,6HmeKLq4s5I7W&`*]&'g1*_%_OUd!0NMK,:Za1Fo%!78.6-^dpe7i'q*bg[A:V/te//^*dZeQmd3b+3">0,"]'IfX.'6n&~>
+Gatm<>BAg]'RnB3n/LcAW02O@J/Pq745JTKec65F#:5KGZ*oni-#N@,hi;M#d?,Yd>@9L+Eb+E>^2oQXAs9^1ai=_X<r-kr8MS(g'LhkJ&@T?O5+sCo9('qO5>Q0'^&@dco^idiB%[<2Vm-.F?@O`+.)LNo/%e)!DF@c9H;iPf\X6.,28n**]:&i5-/YXjesG6chX''KWD??0@/BgBgPg/i+("o=j<n96JHW2\NCtH4;(--Hcu8[Ho3me+HoMW^Nj.ai3S6_S8e,ZT<EZQ[*"p'`+#gT(P2urT+CM6TM?VH5S#$TYAAL$dI)Tj*"^5eHhXIk!>2NX(P0'ODl8@)CV8'q/0JJ'qT,2dZj@eRhl2)eG5hql,[U'&GA5g]tBkWd"ht*C]:.X,IJD:$Y8*\o"9@*4#=aI`*i2[+QM-hWkQX:B7;*(I#aIE4Y-7]I@hlm_I=Ys^3AIVYCG^U?e;aZ/$Qjg2,^d?/3#mjXsjHcBla:tk5Zc1Wp9=8F?5!XlfGL:G5q0gNL#ZR+WL!h;L)l;MY.r[`SLg6)`G/k_:L[KZnU*72gF,.XiE3`m5#7f1@JS.WIo@efS#26'!,qgh`+lfY'KqWP4WnNG$TdmPA&P1bNXJEpZ\B5_g1KufB?AQgcV/oL3562M1$jDUZ1WCYN83`gae1`4ETh\sT\c[KpZmarCa$tR*@=Y;>DfdgNV-pfL=BX>945KHrX\,OeJpGQrNP66?Z4=7n(UJ]F!FJr.#+@(,[$AS,(A(1%\_5*=<PGsSO*;ZEEi+/(;=T%W.k=A%=B:L<*HkrMe:Q"K!o(Z8M"5fTp5;nc@?lk,O^Rbo0TFml-2!Z@S",OMUdbOmPgq&m$kpp(/s\/TK^i\n,l!8D/"_jCY\W;Bet,j)"#4k9Y,V3e.jkl?8C,=oI"U('Om*36V"Gb`8+F[?+[^`ihp$Zpfdkg:i#!bSJpjj$%L?d`6_#Kp=0B4+LVMVirQss;"@\*>SV2g
 op=(.1,XC28DF:0,;X?n:1j]'Wb.SUFW*)2i<_u...@-.LpP>=ki2,O,5a/`l#J;k\mu>9-Z2"<uSKA$uSF8t(>"?/*82.L2pc7M[%J+,^%=DgZ1dXbF=T#uc3+nuNVAp&*oI>jo`AQCR[td7;TqaI0,q(ah.jdG(2!m_[[pb3OpT`#>323Wj*LEW=Faj\2j3$aQbKW?iCt:Ss^4Pb5'-*c,geVDUOr:]L=E^(LLf<IncuH^"q'WVVKi%n&-Fh)tc^hYHL#BWt#t'qHTpGo9I+%V4UGf_^XO^[]a%=lBBh/Y$0oc[M^NB,OJ?GXh<)u6NY6Y$KXhG`+:!W*djgcNqfbL0&mZ^ps9@Co["2nl!!W~>
 endstream
 endobj
 112 0 obj
@@ -1206,80 +1206,80 @@
 xref
 0 153
 0000000000 65535 f 
-0000054446 00000 n 
-0000054652 00000 n 
-0000054745 00000 n 
+0000054490 00000 n 
+0000054696 00000 n 
+0000054789 00000 n 
 0000000015 00000 n 
 0000000071 00000 n 
 0000001333 00000 n 
 0000001453 00000 n 
 0000001639 00000 n 
-0000054897 00000 n 
+0000054941 00000 n 
 0000001774 00000 n 
-0000054960 00000 n 
+0000055004 00000 n 
 0000001909 00000 n 
-0000055026 00000 n 
+0000055070 00000 n 
 0000002046 00000 n 
-0000055090 00000 n 
+0000055134 00000 n 
 0000002183 00000 n 
-0000055156 00000 n 
+0000055200 00000 n 
 0000002320 00000 n 
-0000055222 00000 n 
+0000055266 00000 n 
 0000002457 00000 n 
-0000055288 00000 n 
+0000055332 00000 n 
 0000002594 00000 n 
-0000055352 00000 n 
+0000055396 00000 n 
 0000002731 00000 n 
-0000055416 00000 n 
+0000055460 00000 n 
 0000002868 00000 n 
-0000055482 00000 n 
+0000055526 00000 n 
 0000003005 00000 n 
-0000055547 00000 n 
+0000055591 00000 n 
 0000003142 00000 n 
-0000055613 00000 n 
+0000055657 00000 n 
 0000003279 00000 n 
-0000055679 00000 n 
+0000055723 00000 n 
 0000003416 00000 n 
-0000055744 00000 n 
+0000055788 00000 n 
 0000003553 00000 n 
-0000055810 00000 n 
+0000055854 00000 n 
 0000003690 00000 n 
-0000055874 00000 n 
+0000055918 00000 n 
 0000003826 00000 n 
-0000055938 00000 n 
+0000055982 00000 n 
 0000003963 00000 n 
-0000056004 00000 n 
+0000056048 00000 n 
 0000004100 00000 n 
-0000056070 00000 n 
+0000056114 00000 n 
 0000004237 00000 n 
-0000056135 00000 n 
+0000056179 00000 n 
 0000004373 00000 n 
-0000056201 00000 n 
+0000056245 00000 n 
 0000004510 00000 n 
-0000056265 00000 n 
+0000056309 00000 n 
 0000004647 00000 n 
-0000056331 00000 n 
+0000056375 00000 n 
 0000004783 00000 n 
-0000056397 00000 n 
+0000056441 00000 n 
 0000004920 00000 n 
 0000005673 00000 n 
 0000005796 00000 n 
 0000005872 00000 n 
-0000056461 00000 n 
+0000056505 00000 n 
 0000006004 00000 n 
-0000056527 00000 n 
+0000056571 00000 n 
 0000006137 00000 n 
-0000056591 00000 n 
+0000056635 00000 n 
 0000006270 00000 n 
-0000056656 00000 n 
+0000056700 00000 n 
 0000006403 00000 n 
-0000056721 00000 n 
+0000056765 00000 n 
 0000006536 00000 n 
-0000056786 00000 n 
+0000056830 00000 n 
 0000006669 00000 n 
-0000056851 00000 n 
+0000056895 00000 n 
 0000006801 00000 n 
-0000056916 00000 n 
+0000056960 00000 n 
 0000006934 00000 n 
 0000009083 00000 n 
 0000009191 00000 n 
@@ -1317,47 +1317,47 @@
 0000043725 00000 n 
 0000045456 00000 n 
 0000045566 00000 n 
-0000047461 00000 n 
-0000056981 00000 n 
-0000047571 00000 n 
-0000047771 00000 n 
-0000047989 00000 n 
-0000048195 00000 n 
-0000048403 00000 n 
-0000048571 00000 n 
-0000048771 00000 n 
-0000048929 00000 n 
-0000049104 00000 n 
-0000049367 00000 n 
-0000049608 00000 n 
-0000049737 00000 n 
-0000049891 00000 n 
-0000050045 00000 n 
-0000050189 00000 n 
-0000050339 00000 n 
-0000050480 00000 n 
-0000050715 00000 n 
-0000050910 00000 n 
-0000051150 00000 n 
-0000051332 00000 n 
-0000051505 00000 n 
-0000051708 00000 n 
-0000051896 00000 n 
-0000052148 00000 n 
-0000052289 00000 n 
-0000052498 00000 n 
-0000052684 00000 n 
-0000052858 00000 n 
-0000053103 00000 n 
-0000053294 00000 n 
-0000053500 00000 n 
-0000053666 00000 n 
-0000053780 00000 n 
-0000053891 00000 n 
-0000054003 00000 n 
-0000054112 00000 n 
-0000054219 00000 n 
-0000054336 00000 n 
+0000047505 00000 n 
+0000057025 00000 n 
+0000047615 00000 n 
+0000047815 00000 n 
+0000048033 00000 n 
+0000048239 00000 n 
+0000048447 00000 n 
+0000048615 00000 n 
+0000048815 00000 n 
+0000048973 00000 n 
+0000049148 00000 n 
+0000049411 00000 n 
+0000049652 00000 n 
+0000049781 00000 n 
+0000049935 00000 n 
+0000050089 00000 n 
+0000050233 00000 n 
+0000050383 00000 n 
+0000050524 00000 n 
+0000050759 00000 n 
+0000050954 00000 n 
+0000051194 00000 n 
+0000051376 00000 n 
+0000051549 00000 n 
+0000051752 00000 n 
+0000051940 00000 n 
+0000052192 00000 n 
+0000052333 00000 n 
+0000052542 00000 n 
+0000052728 00000 n 
+0000052902 00000 n 
+0000053147 00000 n 
+0000053338 00000 n 
+0000053544 00000 n 
+0000053710 00000 n 
+0000053824 00000 n 
+0000053935 00000 n 
+0000054047 00000 n 
+0000054156 00000 n 
+0000054263 00000 n 
+0000054380 00000 n 
 trailer
 <<
 /Size 153
@@ -1365,5 +1365,5 @@
 /Info 4 0 R
 >>
 startxref
-57035
+57079
 %%EOF

Modified: lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/SegmentTermEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/SegmentTermEnum.java?rev=909334&r1=909333&r2=909334&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/SegmentTermEnum.java (original)
+++ lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/SegmentTermEnum.java Fri Feb 12 10:16:33 2010
@@ -108,7 +108,7 @@
     return clone;
   }
 
-  final void seek(long pointer, int p, Term t, TermInfo ti)
+  final void seek(long pointer, long p, Term t, TermInfo ti)
           throws IOException {
     input.seek(pointer);
     position = p;

Modified: lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/TermInfosReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/TermInfosReader.java?rev=909334&r1=909333&r2=909334&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/TermInfosReader.java (original)
+++ lucene/java/branches/lucene_2_9/src/java/org/apache/lucene/index/TermInfosReader.java Fri Feb 12 10:16:33 2010
@@ -170,7 +170,7 @@
 
   private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
     enumerator.seek(indexPointers[indexOffset],
-                   (indexOffset * totalIndexInterval) - 1,
+                   ((long) indexOffset * totalIndexInterval) - 1,
                    indexTerms[indexOffset], indexInfos[indexOffset]);
   }
 
@@ -241,28 +241,6 @@
     return ti;
   }
 
-  /** Returns the nth term in the set. */
-  final Term get(int position) throws IOException {
-    if (size == 0) return null;
-
-    SegmentTermEnum enumerator = getThreadResources().termEnum;
-    if (enumerator.term() != null &&
-        position >= enumerator.position &&
-	position < (enumerator.position + totalIndexInterval))
-      return scanEnum(enumerator, position);      // can avoid seek
-
-    seekEnum(enumerator, position/totalIndexInterval); // must seek
-    return scanEnum(enumerator, position);
-  }
-
-  private final Term scanEnum(SegmentTermEnum enumerator, int position) throws IOException {
-    while(enumerator.position < position)
-      if (!enumerator.next())
-	return null;
-
-    return enumerator.term();
-  }
-
   private void ensureIndexIsRead() {
     if (indexTerms == null) {
       throw new IllegalStateException("terms index was not loaded when this reader was created");

Modified: lucene/java/branches/lucene_2_9/src/site/src/documentation/content/xdocs/fileformats.xml
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_2_9/src/site/src/documentation/content/xdocs/fileformats.xml?rev=909334&r1=909333&r2=909334&view=diff
==============================================================================
--- lucene/java/branches/lucene_2_9/src/site/src/documentation/content/xdocs/fileformats.xml (original)
+++ lucene/java/branches/lucene_2_9/src/site/src/documentation/content/xdocs/fileformats.xml Fri Feb 12 10:16:33 2010
@@ -1845,11 +1845,12 @@
 
             <p>
 	      When referring to term numbers, Lucene's current
-	      implementation uses a Java <code>int</code>, which means
-	      the maximum number of unique terms in any single index
-	      segment is 2,147,483,648.  This is technically not a
-	      limitation of the index file format, just of Lucene's
-	      current implementation.
+	      implementation uses a Java <code>int</code> to hold the
+	      term index, which means the maximum number of unique
+	      terms in any single index segment is ~2.1 billion times
+	      the term index interval (default 128) = ~274 billion.
+	      This is technically not a limitation of the index file
+	      format, just of Lucene's current implementation.
 	    </p>
 	    <p>
 	      Similarly, Lucene uses a Java <code>int</code> to refer