You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/07/08 00:18:13 UTC
svn commit: r209663 [1/12] - in /lucene/nutch/branches/mapred: conf/ site/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/protocol/ src/java/org/apache/nutch/segment/ src/j...

Author: cutting
Date: Thu Jul  7 15:18:08 2005
New Revision: 209663

URL: http://svn.apache.org/viewcvs?rev=209663&view=rev
Log:
svn merge -r 190963:209656 from trunk

Modified:
    lucene/nutch/branches/mapred/conf/nutch-default.xml
    lucene/nutch/branches/mapred/site/mailing_lists.html
    lucene/nutch/branches/mapred/site/mailing_lists.pdf
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolStatus.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/build.xml
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/da.ngp
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/de.ngp
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/el.ngp
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/en.ngp
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/es.ngp
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/fi.ngp
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/fr.ngp
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/hu.ngp
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/it.ngp
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/nl.ngp
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/pl.ngp
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/pt.ngp
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/sv.ngp
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestNGramProfile.java
    lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
    lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
    lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml
    lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
    lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
    lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
    lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
    lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
    lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
    lucene/nutch/branches/mapred/src/site/src/documentation/content/xdocs/mailing_lists.xml

Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Thu Jul  7 15:18:08 2005
@@ -23,6 +23,15 @@
 </property>
 
 <property>
+  <name>http.robots.403.allow</name>
+  <value>true</value>
+  <description>Some servers return HTTP status 403 (Forbidden) if
+  /robots.txt doesn't exist. This should probably mean that we are
+  allowed to crawl the site nonetheless. If this is set to false,
+  then such sites will be treated as forbidden.</description>
+</property>
+
+<property>
   <name>http.agent.description</name>
   <value>Nutch</value>
   <description>Further description of our bot- this text is used in
@@ -745,6 +754,40 @@
   <value>1.0</value>
   <description> Used as a boost for phrase in Lucene query.
   Multiplied by boost for field phrase is matched in.
+  </description>
+</property>
+
+<!-- language-identifier plugin properties -->
+
+<property>
+  <name>lang.ngram.min.length</name>
+  <value>1</value>
+  <description> The minimum size of ngrams to use to identify
+  language (must be between 1 and lang.ngram.max.length).
+  The larger the range between lang.ngram.min.length and
+  lang.ngram.max.length, the better the identification, but
+  the slower it is.
+  </description>
+</property>
+
+<property>
+  <name>lang.ngram.max.length</name>
+  <value>4</value>
+  <description> The maximum size of ngrams to use to identify
+  language (must be between lang.ngram.min.length and 4).
+  The larger the range between lang.ngram.min.length and
+  lang.ngram.max.length, the better the identification, but
+  the slower it is.
+  </description>
+</property>
+
+<property>
+  <name>lang.analyze.max.length</name>
+  <value>2048</value>
+  <description> The maximum number of bytes of data to use to identify
+  the language (0 means full content analysis).
+  The larger this value, the better the analysis, but the
+  slower it is.
   </description>
 </property>
 

Modified: lucene/nutch/branches/mapred/site/mailing_lists.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/mailing_lists.html?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/mailing_lists.html (original)
+++ lucene/nutch/branches/mapred/site/mailing_lists.html Thu Jul  7 15:18:08 2005
@@ -200,20 +200,20 @@
 <p>If you use Nutch, please subscribe to the Nutch user mailing list.</p>
 <p>
         The Nutch user mailing list is :
-        <a href="mailto:nutch-user@incubator.apache.org">nutch-user@incubator.apache.org</a>.
+        <a href="mailto:nutch-user@lucene.apache.org">nutch-user@lucene.apache.org</a>.
       </p>
 <ul>
         
 <li>
-<a href="mailto:nutch-user-subscribe@incubator.apache.org">Subscribe to List</a>
+<a href="mailto:nutch-user-subscribe@lucene.apache.org">Subscribe to List</a>
 </li>
         
 <li>
-<a href="mailto:nutch-user-unsubscribe@incubator.apache.org">Unsubscribe from List</a>
+<a href="mailto:nutch-user-unsubscribe@lucene.apache.org">Unsubscribe from List</a>
 </li>
         
 <li>
-<a href="http://incubator.apache.org/mail/nutch-user/">View List Archive</a>
+<a href="http://lucene.apache.org/mail/nutch-user/">View List Archive</a>
 </li>
       
 </ul>
@@ -231,20 +231,20 @@
       Nutch developer mailing list.</p>
 <p>
         The Nutch developer mailing list is :
-        <a href="mailto:nutch-dev@incubator.apache.org">nutch-dev@incubator.apache.org</a>.
+        <a href="mailto:nutch-dev@lucene.apache.org">nutch-dev@lucene.apache.org</a>.
       </p>
 <ul>
         
 <li>
-<a href="mailto:nutch-dev-subscribe@incubator.apache.org">Subscribe to List</a>
+<a href="mailto:nutch-dev-subscribe@lucene.apache.org">Subscribe to List</a>
 </li>
         
 <li>
-<a href="mailto:nutch-dev-unsubscribe@incubator.apache.org">Unsubscribe from List</a>
+<a href="mailto:nutch-dev-unsubscribe@lucene.apache.org">Unsubscribe from List</a>
 </li>
         
 <li>
-<a href="http://incubator.apache.org/mail/nutch-dev/">View List Archive</a>
+<a href="http://lucene.apache.org/mail/nutch-dev/">View List Archive</a>
 </li>
       
 </ul>
@@ -263,15 +263,15 @@
 <ul>
         
 <li>
-<a href="mailto:nutch-commits-subscribe@incubator.apache.org">Subscribe to List</a>
+<a href="mailto:nutch-commits-subscribe@lucene.apache.org">Subscribe to List</a>
 </li>
         
 <li>
-<a href="mailto:nutch-commits-unsubscribe@incubator.apache.org">Unsubscribe from List</a>
+<a href="mailto:nutch-commits-unsubscribe@lucene.apache.org">Unsubscribe from List</a>
 </li>
         
 <li>
-<a href="http://incubator.apache.org/mail/nutch-commits/">View List Archive</a>
+<a href="http://lucene.apache.org/mail/nutch-commits/">View List Archive</a>
 </li>
       
 </ul>
@@ -287,20 +287,20 @@
       about the Nutch crawler.</p>
 <p>
         The Nutch agent mailing list is :
-        <a href="mailto:nutch-agent@incubator.apache.org">nutch-agent@incubator.apache.org</a>.
+        <a href="mailto:nutch-agent@lucene.apache.org">nutch-agent@lucene.apache.org</a>.
       </p>
 <ul>
         
 <li>
-<a href="mailto:nutch-agent-subscribe@incubator.apache.org">Subscribe to List</a>
+<a href="mailto:nutch-agent-subscribe@lucene.apache.org">Subscribe to List</a>
 </li>
         
 <li>
-<a href="mailto:nutch-agent-unsubscribe@incubator.apache.org">Unsubscribe from List</a>
+<a href="mailto:nutch-agent-unsubscribe@lucene.apache.org">Unsubscribe from List</a>
 </li>
         
 <li>
-<a href="http://incubator.apache.org/mail/nutch-agent/">View List Archive</a>
+<a href="http://lucene.apache.org/mail/nutch-agent/">View List Archive</a>
 </li>
       
 </ul>

Modified: lucene/nutch/branches/mapred/site/mailing_lists.pdf
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/mailing_lists.pdf?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/mailing_lists.pdf (original)
+++ lucene/nutch/branches/mapred/site/mailing_lists.pdf Thu Jul  7 15:18:08 2005
@@ -69,10 +69,10 @@
 >>
 endobj
 16 0 obj
-<< /Length 2391 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 2383 /Filter [ /ASCII85Decode /FlateDecode ]
  >>
 stream
-Gb!l!gN)%,&:N/3n?'u0%1s_-b>XV3l*SoOg.(="h<Du...@0C5r>QK#S?Y/Je0hs)Zc1('r][6eNY[m$OSGY4395XkfsE`-mg"*"g]saTU,CP;uM?cU+m=CHKn%<h5]5d=Cj:bjTgHAQPOBkQhqCcOW:m8["(1KWI;q3\`2mP[_"Go@;OQ,!&5iB%?(:3f@iKlG?<iQPL(pubnEkC$j"3t!2KO(VRgAMTGD8j01M^&:1NC.cE8cRGYYj$"ia\n(rMef*f4AR#;.P]TU$,Ahp^Z.7,?t)ab*1U+[FBVJ;n=3Es$rUW<0J(ZsdJ-.tVWaf.!;ro<erK.qVPT$IhE"'UqJdR#Dar4AoB_4o=[0:fS%0QclW`>1SD:e7H*gA^tBX!Z&ZEcR!!60i)]\a=sV><\"Y"hDB2\'G@09OZ<)[A]A`1!%@b1oZedEXJ9J8.iC0`PR;;dBV2Ptkb]eOjc-_;QTTo,aq\4HLV-6j/JO%%V4f,`X+_N%UP0Ccj?dl"=g)>A&h[]?F7J!mqh;O2==>!%@lmeb+47=!Vj')j&b4hgCqK:42emWC`LSRJY9.f?`#:`+]%9AFHtm>BZXf8ajgo[XLYS^0At*N5dn2Qdg<apOBb"DQPo2AgB"gt\P@OT"iFk1]M9gcN9Cp[LYJB$"SXVaB+H(;PGlL!M5[0Z=,rURc!1M5Ah?V`>][.'Qdoen'3:dF[);p6OHWnT).m'P':bqEp%'j5NLH+rFg9INlK_6NNjMa=QjWQS;;g?ZML!@2"Ij.B8lRHZC=Jf_HM6MeL>(k!3+UDLaN"CDAe<)lZM\)$?/,!.n!SSI:CK7'!l\mjhbT0a<e`Or2JB,CN,2/Q;g)e@u$O[Er6^aLY5sd>[EA>Ti=X4+nb`!..Dj!/U/R/pc$E'js5"S+4UGT/TKd*o\<iIm!UtDiZ8m_H1[P,*4C//M\QK6,@jTb):!H)>40Rke$D_aul6<G0O7,KbBe67)D9GL9#R9UDWOQA:!<eVa*o>[ach:akT&Ujc@jSAI7bq:qM?%A7[-P&=L9\G->Be"G_3-cM/.UZW13(Y]g)CKcK'.S'KWH#L'==>!%0/cRsa%=(As,h;1d"*m=Z%n:e]DQbCX'+&m6BCmR]>?Fj$$p*&=D.N,`;X5(hG1DK2u0I1*1NN-`=$sr&3:r\GCmMf#n$hm@UglE$HhoOb[d/T-K6'U&3#*.RNNY:oE#":$t+aV+d?@LM?GlP1d@E3pTSGAGa`IT&bdD-'MI@R(4oO7&mRVe<H9"'^2Q%=>$*WY:@?21,#NBZ:.Ot"l'?,C2L8OOEC_kNHYLWNDjl$ldWukE-O+o6aqK02H`B=0Mjq,?(/9foeig;7j']DQ[]U1"Lo*V.>Tn\1p<<ljGD.'ZZ`XiO$)_C=[25KAiOi2+MR:[=_TpqCUY_;?H*oG=$p$6B@Y4o#W&CV5'6<0N@KZ=+An^WAOUU4"N&")P&4$R(B8#J1_iAC<B8EP(Be#<&e.C%GcDss=8F;?lm*%=$h"C)sd=?K8,mObRmHP"m-:][>3FEA/NY_$oo*<bB-?k7gKS_KOo\EqGXbAjT<7*OfaW2Q2=[KH0<@Ca,mg_G.Xkf[!`$Hli6VgA6InG!EcV[L8AtNW*/M.-)b`rL=s)+%:*[J]W02?g9q56ijZ<8@$58<F],AMOG6s?H&mY@j\\hY\f,1\<KE-p$?73eEe$lc"RTTgP.U*X!p90M/Eaii38RiqY*l&ZU8jpA^rZ;#d#J-iu=Ip,52=HnQraOREB9cJC3=/PrlS<Oc\P6bZHT:[XGbaZjN$Jk-T6()`^#Tg@X+kV&YM*b2r@=;D:r2`u$*N2bbT%hpO>4H"06f6i3^KZ1,#2@tmC$`IYZp;qX__4fis!!c6j^M?)plbfM!KH,/r<h%H*.XT'e[NTUjU][+\j'.4l]1cT+hOn\YJc2jmDYRW[b'+Qo3M$r0$q]:r1%WUp!sZgG!,&JL:]k*8XU9Rad2.Eq\R$W,Vo~>
+Gb!l!gN)%,&:N/3n?'u0%2%M]:JK97dnZBH)l531V^"j!`2Q-c,\>SWhZ\k[ncBGid\fj[YlJdi3=@CS4dOs-(<cH76[drTX51X8p'^/qn;fVe$Zi$1&aG(qlK%k_6cgkAQ3D[8k#7P2eLTfYBD!Jl":eup68t?1i^[kT%m(g0>b(KF>buc9iEU:2?gI7"0jWi6X7LbI.C4@X<>XllTo[a_.J;o3cAl&Af+Z3+Wej9FRtU)?-B_J:LTa3NV-gtQXP^,qT,0)ao@s#bK)(Xa@I++g6JTXB3F8IN%$-EAiZN95S<&S8(N3A1ZJ7N6^I5jdjBV@0(]2JI]Lh^ZX<6#_6V<5V08'b6+G_7;fu2c*WR\uR.A4:S3^'4;KT9,*&15=+"?L%D@6(8*2]'L[jGQ<VJ9WW2'8,D^76.Y1&hP938F3*(X`==m+]f4N.Mr6*4b,5<mI?/98ui<KC3e?jbNsf^9B8jR$imlj"$i@f&'f].R4;%Vr2H[l5ul<c#tTL/87C>SgO6`QB'LiO$+$cbfNhj%8eT0biki8.l;6R7.Bhk`X%V,`fp%5r/aEH`54N<""Me6m#&:i`Z8e8*G%!C,1s%NVLq3SF#*YJ?JrtUfW1gX:N<_J'e^caqY.EQ5kZV&SchIML$g)^[8XQ$XYe#?g5$!0ZdO,]NQ]%&_PN5)]YQk0?eD0!e!$tPIC8#=M`_KP;gRbKF4C]aS(I4;0S59s!Kt\upL[6`9X"8LS[]'&f]:2/n5J1;T#:6b<-+DA-[o>68asS?ji3DI..#s]CGC!d<PnE%s6cX-A[9Eu@;5[*YHb:oi\75LQINWrL'-6UV)Sn1`9cU!]gEJ[dCaQaX6!\c&mJHS%i087N'DeE$dr*2o&tdUnOh]F5%]7W?$8$9AI1fbfA$Ze#A0c4&g0jf3\1C`1(VsTCa:F>VfZ6u`S%tBoE)3peGcpqMU>4bJ't;X&"n#NQNAb]l;9tgOqV43hUfFfTT>r+!=c(gD3rQ/2IAQWjSJ0_4k(:a7)=fE;VN9uHGd&kU^)rJfKqf.?X"@Y/^C<4_7:&=skM#G)IqX)81<iHsO3^/V3Q$-sA'=F0mfU6MItIGH+;A@VRY']MQ^G"_57.PTdN6K&g>;i(e8%%a>)?+L(YKl`#tQ17\(ONnK,-"Q(.A!23HJ=V_:]KiS#lk2,3!;K-52%TKOHT%oPIDpnuN?g7)GSKC[D:=.l=G0Kd*WTAuKe<Yfs7+6XC0Hp/MmWk1nF/8BtIBA\]A/eEdVVOs,dA;231*+UDL_N"CD9e.HQVMD/$e9kEla)F>@;H*o+9WL^:`SSL_Fm7M0TnHZ7GXbCb93brd^B.g8C?rtZj$[[R&_eFR`5sc."aqu;u(9+([_BWKhGace9Oq<;JSQW?M'WIW^8tF/mS._(D(GJHf^s?Y:\P_Vn][S?T?k=!o/i61!dF1S,o:`*Lp'$G=e8hNcK',"b17+=GV$hp0EY84YUZ?^IB_q9!a=&cV4IF@F6Up<GEB7\(X@m#b7tR+6iKC2T]s0T!o6jeN5I!XqB6hg]HrV'09[AFPKtLLj]as<g_;6Ml7A60TE=j)mAbcVo0e78XEK:<[Rm0;8&.]n0!<<^c0Z2*G9!AXkJf>e8*)2$3fUOO]AfF&^4U<-e-$3W)_($-*Y+D[T4cMR/HoMI:=EO5,gl]?>U?#.CNp8pY/_A?doX+VVOQP-gdZiMC2"um(eMaj2%/)<:-hqND`'<-PR^JI,9[6no3\gVJY?VrL1PeK<diqIt&4$TRAq]B;_2dSZ1Pgj?6*o1s$d`L)?E4E%>%c&ekLs7#hNh>fSKVMPH8GFjK;WFZG)1JN3HJW*!IH$>ZT!P.L+m=2)(%_,+G=Yh6M(^q8$msn`)P,@Q,sSP#\aS-eZ)HAG/b):Xu025Q)9YjOEgKl#2'4JdKuBiB.g8E@#niM(8_Z7LXZ/J;huQLG;DSX9jpPoKdjr>nMHNQ'I!fc5I5*)-rN-F$9OU/R6jc&92oJW<gLW
&PkluAO':3qq_Xr6A18@(?hOElhcMYDK!a;Jj[i7Ds3tah29VcKL3!677(mJh8?8i3D/ao*f*4ZJ`0qK"mRipcB]K8M@:-sR8_JVE3h\E<lM3n?%T`o%IIQR#6Wt\&-hKIE2oW<1-N*T9Bam48=W)CX;"mgGD.pIbRJ,fb>1Lth,)l2CEN\[aD`0C'FdL,QA`eV&n=Y,f^r=c)'WQbApfVb2ZX0n[Jfb"VPI"W'ik]1rJ`A]e0)rKjG.GMc3,kQmJQ=j`?_1[*74_rl`NXUWQNfDL)P`;G'W@9B";gI*08@.j*hLPkJ#G6U*=F8^G25EE`+M^0i(\:Ilg`#)=$Nf[2"SF@kM%lO)M5hd7m1Oq.l:#%[!YQV6nd?1-A`AY~>
 endstream
 endobj
 17 0 obj
@@ -103,10 +103,10 @@
 19 0 obj
 << /Type /Annot
 /Subtype /Link
-/Rect [ 241.668 608.466 404.004 596.466 ]
+/Rect [ 241.668 608.466 390.0 596.466 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-user@incubator.apache.org)
+/A << /URI (mailto:nutch-user@lucene.apache.org)
 /S /URI >>
 /H /I
 >>
@@ -117,7 +117,7 @@
 /Rect [ 108.0 578.066 189.336 566.066 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-user-subscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-user-subscribe@lucene.apache.org)
 /S /URI >>
 /H /I
 >>
@@ -128,7 +128,7 @@
 /Rect [ 108.0 564.866 215.988 552.866 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-user-unsubscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-user-unsubscribe@lucene.apache.org)
 /S /URI >>
 /H /I
 >>
@@ -139,7 +139,7 @@
 /Rect [ 108.0 551.666 197.316 539.666 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (http://incubator.apache.org/mail/nutch-user/)
+/A << /URI (http://lucene.apache.org/mail/nutch-user/)
 /S /URI >>
 /H /I
 >>
@@ -147,10 +147,10 @@
 23 0 obj
 << /Type /Annot
 /Subtype /Link
-/Rect [ 268.992 434.522 428.664 422.522 ]
+/Rect [ 268.992 434.522 414.66 422.522 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-dev@incubator.apache.org)
+/A << /URI (mailto:nutch-dev@lucene.apache.org)
 /S /URI >>
 /H /I
 >>
@@ -161,7 +161,7 @@
 /Rect [ 108.0 404.122 189.336 392.122 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-dev-subscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-dev-subscribe@lucene.apache.org)
 /S /URI >>
 /H /I
 >>
@@ -172,7 +172,7 @@
 /Rect [ 108.0 390.922 215.988 378.922 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-dev-unsubscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-dev-unsubscribe@lucene.apache.org)
 /S /URI >>
 /H /I
 >>
@@ -183,7 +183,7 @@
 /Rect [ 108.0 377.722 197.316 365.722 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (http://incubator.apache.org/mail/nutch-dev/)
+/A << /URI (http://lucene.apache.org/mail/nutch-dev/)
 /S /URI >>
 /H /I
 >>
@@ -205,7 +205,7 @@
 /Rect [ 108.0 251.378 189.336 239.378 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-commits-subscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-commits-subscribe@lucene.apache.org)
 /S /URI >>
 /H /I
 >>
@@ -216,7 +216,7 @@
 /Rect [ 108.0 238.178 215.988 226.178 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-commits-unsubscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-commits-unsubscribe@lucene.apache.org)
 /S /URI >>
 /H /I
 >>
@@ -227,16 +227,16 @@
 /Rect [ 108.0 224.978 197.316 212.978 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (http://incubator.apache.org/mail/nutch-commits/)
+/A << /URI (http://lucene.apache.org/mail/nutch-commits/)
 /S /URI >>
 /H /I
 >>
 endobj
 31 0 obj
-<< /Length 858 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 853 /Filter [ /ASCII85Decode /FlateDecode ]
  >>
 stream
-GatU29okbt&A@ZcHqY#)[_ZA7eD7.(<4,`<TtN9GO=VPG$)Bh0eULq9^ioKFZrJa&^qBDRk2GeH=S@UR9HXDQhEi?n0L4WQ!Cn?s;$&oAK&?c\c%8;>#7El!)mc`7Ed,l=(e8;>_c)pBP;LjI@o=/QP<rV0kMS9)5$8s/l]5%EK;XdG,-DG>NsQ+0\jF7?\CcF)>d]tE5`ehb]RmQah0KL`l9#9N$]SUPP#g9Q;Y%FFh-IHtRH6+bLV#nVZTN+>or$bf8M!.cZH4k+hEou@FgJ<'[T?Q?rL/=Gkd-S,&3;QeN*,[.P-_uh6AI7[l-8@:c+Q-FGQa2<_=_ADm.+[+NCi>1<[-2'Lctt-6?q%pgi3TG=HsefbA9jf#?DE3BUYk)d4#V<N'kYUma.t+%BZ8a)ppG[k+i#8SfRCUU1j9WlFIp8'.^!T>7LCiW>0Em<9B0@6X1*qZN!IR>A!2_edJ"40sn_+"F$Pl]l==!JT[q8HCniJ0sr%[MTVQgl9k=e#_M;__qIFoWM+LRD?R:pB7kB>V#P3ib/JpO\)F9QTi]RcE?OF4A_q%/MOh-qr_0'<1%l)]`U?OmpE@1%KK(K_'4;M@BiV!a>%;bDLTh(i+2X4@6\fUcU+t#3[Aq_QqiL7Va\+I$2M;uRj71C\:mU:@D%E&WY)I$52REp_O^XaT(dZs29RWf5Pm:o.oQNltW('urcSi3W$l6%7:Ot%r\.Ku#j:U%RL)[BWa,i@lof"$r'02q+e9D]`ik*]&rP^h.7fa9!pbg:mm//e<i^9]M2M>N[M5aAn2'6OTn/qt8j)qd`?c1[?XKkLq*0s0^6DNmle:H[5N&W;]r$?ZDH><H[./HU[~>
+GatU29okbt&A@ZcHqY#)[_ZA7eD7.(<4,...@c>aui6tP6;[YgW`mJT_qb8>S]XlB)Pq>1_0(IHo?e[%IITsUgt6`mW'U$L;5Y-6_SWhJaf"4=XA[TZ[GW-ua2EUk"X3=VYdf/UL4`FX<ug31e>G%Eto0VE9-N!t'W`sC\UA*U27Y7*,^U0g:Q6oETUHE`MX@Bb!lHjX-SIuPSE+:l<&Fk__e3JQ='W4rYTR`I3pUW06o&`9GIV([#64URQ9iR]dPZd[eoM;hECD$\*A&gO56C>R(ti'@iNa6RPK?FID,=_6dgAQpKa$B;@/5mNRb0]K;Bo@NI8lDa^a^5,3C~>
 endstream
 endobj
 32 0 obj
@@ -259,10 +259,10 @@
 34 0 obj
 << /Type /Annot
 /Subtype /Link
-/Rect [ 247.668 660.8 416.004 648.8 ]
+/Rect [ 247.668 660.8 402.0 648.8 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-agent@incubator.apache.org)
+/A << /URI (mailto:nutch-agent@lucene.apache.org)
 /S /URI >>
 /H /I
 >>
@@ -273,7 +273,7 @@
 /Rect [ 108.0 630.4 189.336 618.4 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-agent-subscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-agent-subscribe@lucene.apache.org)
 /S /URI >>
 /H /I
 >>
@@ -284,7 +284,7 @@
 /Rect [ 108.0 617.2 215.988 605.2 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-agent-unsubscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-agent-unsubscribe@lucene.apache.org)
 /S /URI >>
 /H /I
 >>
@@ -295,7 +295,7 @@
 /Rect [ 108.0 604.0 197.316 592.0 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (http://incubator.apache.org/mail/nutch-agent/)
+/A << /URI (http://lucene.apache.org/mail/nutch-agent/)
 /S /URI >>
 /H /I
 >>
@@ -414,53 +414,53 @@
 xref
 0 48
 0000000000 65535 f 
-0000009417 00000 n 
-0000009489 00000 n 
-0000009581 00000 n 
+0000009354 00000 n 
+0000009426 00000 n 
+0000009518 00000 n 
 0000000015 00000 n 
 0000000071 00000 n 
 0000000619 00000 n 
 0000000739 00000 n 
 0000000785 00000 n 
-0000009704 00000 n 
+0000009641 00000 n 
 0000000920 00000 n 
-0000009767 00000 n 
+0000009704 00000 n 
 0000001057 00000 n 
-0000009833 00000 n 
+0000009770 00000 n 
 0000001194 00000 n 
-0000009899 00000 n 
+0000009836 00000 n 
 0000001331 00000 n 
-0000003815 00000 n 
-0000003938 00000 n 
-0000004042 00000 n 
-0000004234 00000 n 
-0000004434 00000 n 
-0000004636 00000 n 
-0000004832 00000 n 
-0000005023 00000 n 
-0000005222 00000 n 
-0000005423 00000 n 
-0000005618 00000 n 
-0000005792 00000 n 
-0000005995 00000 n 
-0000006200 00000 n 
-0000006399 00000 n 
-0000007349 00000 n 
-0000007472 00000 n 
-0000007520 00000 n 
-0000007709 00000 n 
-0000007906 00000 n 
-0000008105 00000 n 
-0000009965 00000 n 
-0000008298 00000 n 
-0000008419 00000 n 
-0000008585 00000 n 
-0000008733 00000 n 
-0000008861 00000 n 
-0000008974 00000 n 
-0000009084 00000 n 
-0000009192 00000 n 
-0000009308 00000 n 
+0000003807 00000 n 
+0000003930 00000 n 
+0000004034 00000 n 
+0000004221 00000 n 
+0000004418 00000 n 
+0000004617 00000 n 
+0000004810 00000 n 
+0000004997 00000 n 
+0000005193 00000 n 
+0000005391 00000 n 
+0000005583 00000 n 
+0000005757 00000 n 
+0000005957 00000 n 
+0000006159 00000 n 
+0000006355 00000 n 
+0000007300 00000 n 
+0000007423 00000 n 
+0000007471 00000 n 
+0000007655 00000 n 
+0000007849 00000 n 
+0000008045 00000 n 
+0000009902 00000 n 
+0000008235 00000 n 
+0000008356 00000 n 
+0000008522 00000 n 
+0000008670 00000 n 
+0000008798 00000 n 
+0000008911 00000 n 
+0000009021 00000 n 
+0000009129 00000 n 
+0000009245 00000 n 
 trailer
 <<
 /Size 48
@@ -468,5 +468,5 @@
 /Info 4 0 R
 >>
 startxref
-10016
+9953
 %%EOF

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Thu Jul  7 15:18:08 2005
@@ -92,11 +92,16 @@
 
               case ProtocolStatus.MOVED:         // redirect
               case ProtocolStatus.TEMP_MOVED:
-                url = status.getMessage();
-                if (url != null) {
+                String newUrl = status.getMessage();
+                newUrl = URLFilters.filter(newUrl);
+                if (newUrl != null && !newUrl.equals(url)) {
+                  url = newUrl;
                   redirecting = true;
                   redirectCount++;
-                  LOG.fine(" - protocol redirect to " + url);
+                  LOG.fine(" - redirect to " + url);
+                } else {
+                  LOG.fine(" - redirect skipped: " +
+                           (url.equals(newUrl) ? "to same url" : "filtered"));
                 }
                 break;
 

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Jul  7 15:18:08 2005
@@ -115,7 +115,7 @@
           if (!fle.getFetch()) {                  // should we fetch this page?
             if (LOG.isLoggable(Level.FINE))
               LOG.fine("not fetching " + url);
-            handleNoFetch(fle, ProtocolStatus.STATUS_NOTFETCHING);
+            handleFetch(fle, new ProtocolOutput(null, ProtocolStatus.STATUS_NOTFETCHING));
             continue;
           }
 
@@ -124,7 +124,7 @@
           // in parsing mode). Protocol-level redirects take precedence over
           // content-level redirects. Some plugins can handle redirects
           // automatically, so that only the final success or failure will be
-          // shown? here.
+          // reported here.
           boolean refetch = false;
           int redirCnt = 0;
           do {
@@ -145,15 +145,19 @@
                       status();
                     }
                   }
-                  ParseStatus ps = handleFetch(url, fle, output);
+                  ParseStatus ps = handleFetch(fle, output);
                   if (ps != null && ps.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
-                    url = ps.getMessage();
-                    url = URLFilters.filter(url);
-                    if (url != null) {
+                    String newurl = ps.getMessage();
+                    newurl = URLFilters.filter(newurl);
+                    if (newurl != null && !newurl.equals(url)) {
                       refetch = true;
+                      url = newurl;
                       redirCnt++;
                       fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
-                      LOG.info(" - content redirect to " + url);
+                      LOG.fine(" - content redirect to " + url);
+                    } else {
+                      LOG.fine(" - content redirect skipped, " +
+                              (url.equals(newurl)? "newurl == url" : "prohibited by urlfilter"));
                     }
                   }
                 }
@@ -161,14 +165,19 @@
               case ProtocolStatus.MOVED: // try to redirect immediately
               case ProtocolStatus.TEMP_MOVED: // try to redirect immediately
                 // record the redirect. perhaps the DB will want to know this.
-                handleNoFetch(fle, pstat);
-                url = pstat.getMessage();
-                if (url != null) {
+                handleFetch(fle, output);
+                String newurl = pstat.getMessage();
+                newurl = URLFilters.filter(newurl);
+                if (newurl != null && !newurl.equals(url)) {
                   refetch = true;
+                  url = newurl;
                   redirCnt++;
                   // create new entry.
                   fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
                   LOG.info(" - protocol redirect to " + url);
+                } else {
+                  LOG.fine(" - protocol redirect skipped, " +
+                          (url.equals(newurl)? "newurl == url" : "prohibited by urlfilter"));
                 }
                 break;
               case ProtocolStatus.GONE:
@@ -177,22 +186,22 @@
               case ProtocolStatus.ROBOTS_DENIED:
               case ProtocolStatus.RETRY:
               case ProtocolStatus.NOTMODIFIED:
-                handleNoFetch(fle, pstat);
+                handleFetch(fle, output);
                 break;
               case ProtocolStatus.EXCEPTION:
                 logError(url, fle, new Exception(pstat.getMessage()));                // retry?
-                handleNoFetch(fle, pstat);
+                handleFetch(fle, output);
               break;
               default:
                 LOG.warning("Unknown ProtocolStatus: " + pstat.getCode());
-                handleNoFetch(fle, pstat);
+                handleFetch(fle, output);
             }
           } while (refetch && (redirCnt < MAX_REDIRECT));
 
         } catch (Throwable t) {                   // an unchecked exception
           if (fle != null) {
             logError(url, fle, t);                // retry?
-            handleNoFetch(fle, new ProtocolStatus(t));
+            handleFetch(fle, new ProtocolOutput(null, new ProtocolStatus(t)));
           }
         }
       }
@@ -220,59 +229,47 @@
       }
     }
 
-    private ParseStatus handleFetch(String url, FetchListEntry fle, ProtocolOutput output) {
+    private ParseStatus handleFetch(FetchListEntry fle, ProtocolOutput output) {
       Content content = output.getContent();
+      MD5Hash hash = null;
+      String url = fle.getPage().getURL().toString();
+      if (content == null) {
+        content = new Content(url, url, new byte[0], "", new Properties());
+        hash = MD5Hash.digest(url);
+      } else {
+        hash = MD5Hash.digest(content.getContent());
+      }
       ProtocolStatus protocolStatus = output.getStatus();
       if (!Fetcher.this.parsing) {
-        outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
-                protocolStatus),
+        outputPage(new FetcherOutput(fle, hash, protocolStatus),
                 content, null, null);
         return null;
       }
-
-        String contentType = content.getContentType();
-        Parser parser = null;
-        Parse parse = null;
-        ParseStatus status = null;
-        try {
-          parser = ParserFactory.getParser(contentType, url);
-          parse = parser.getParse(content);
-          status = parse.getData().getStatus();
-        } catch (Exception e) {
-          e.printStackTrace();
-          status = new ParseStatus(e);
-        }
-        if (status.isSuccess()) {
-          outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
-                  protocolStatus),
-                  content, new ParseText(parse.getText()), parse.getData());
-        } else {
-          LOG.info("fetch okay, but can't parse " + url + ", reason: "
-                  + status.toString());
-          outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
-                  protocolStatus),
-                  content, new ParseText(""),
-                  new ParseData(status, "", new Outlink[0], new Properties()));
-        }
-        return status;
-    }
-
-    private void handleNoFetch(FetchListEntry fle, ProtocolStatus status) {
-      String url = fle.getPage().getURL().toString();
-      MD5Hash hash = MD5Hash.digest(url);
-
-      if (Fetcher.this.parsing) {
-        outputPage(new FetcherOutput(fle, hash, status),
-                   new Content(url, url, new byte[0], "", new Properties()),
-                   new ParseText(""),
-                   new ParseData(ParseStatus.STATUS_NOTPARSED, "", new Outlink[0], new Properties()));
+      String contentType = content.getContentType();
+      Parser parser = null;
+      Parse parse = null;
+      ParseStatus status = null;
+      try {
+        parser = ParserFactory.getParser(contentType, url);
+        parse = parser.getParse(content);
+        status = parse.getData().getStatus();
+      } catch (Exception e) {
+        e.printStackTrace();
+        status = new ParseStatus(e);
+      }
+      if (status.isSuccess()) {
+        outputPage(new FetcherOutput(fle, hash, protocolStatus),
+                content, new ParseText(parse.getText()), parse.getData());
       } else {
-        outputPage(new FetcherOutput(fle, hash, status),
-                   new Content(url, url, new byte[0], "", new Properties()),
-                   null, null);
+        LOG.info("fetch okay, but can't parse " + url + ", reason: "
+                + status.toString());
+        outputPage(new FetcherOutput(fle, hash, protocolStatus),
+                content, new ParseText(""),
+                new ParseData(status, "", new Outlink[0], new Properties()));
       }
+      return status;
     }
-      
+
     private void outputPage(FetcherOutput fo, Content content,
                             ParseText text, ParseData parseData) {
       try {
@@ -363,8 +360,8 @@
       for (int i = 0; i < n; i++) {
         // this thread may have gone away in the meantime
         if (list[i] == null) continue;
-        String name = list[i].getName();
-        if (name.startsWith(THREAD_GROUP_NAME)) // prove it
+        String tname = list[i].getName();
+        if (tname.startsWith(THREAD_GROUP_NAME)) // prove it
           noMoreFetcherThread = false;
         if (LOG.isLoggable(Level.FINE))
           LOG.fine(list[i].toString());
@@ -446,7 +443,6 @@
   /** Run the fetcher. */
   public static void main(String[] args) throws Exception {
     int threadCount = -1;
-    long delay = -1;
     String logLevel = "info";
     boolean parsing = true;
     boolean showThreadID = false;

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java Thu Jul  7 15:18:08 2005
@@ -66,25 +66,6 @@
   public static final ParseStatus STATUS_SUCCESS = new ParseStatus(SUCCESS);
   public static final ParseStatus STATUS_FAILURE = new ParseStatus(FAILED);
   
-
-  private static class EmptyParseImpl implements Parse {
-  
-    private ParseData data = null;
-  
-    public EmptyParseImpl(ParseStatus status) {
-      data = new ParseData(status, "", new Outlink[0], new Properties());
-    }
-  
-    public ParseData getData() {
-      return data;
-    }
-
-    public String getText() {
-      return "";
-    }
-  }
-
-
   private byte majorCode = 0;
   private short minorCode = 0;
   private String[] args = null;
@@ -187,7 +168,10 @@
   
   public String toString() {
     StringBuffer res = new StringBuffer();
-    res.append(majorCodes[majorCode] + "(" + majorCode + "," + minorCode + ")");
+    String name = null;
+    if (majorCode >= 0 && majorCode < majorCodes.length) name = majorCodes[majorCode];
+    else name = "UNKNOWN!";
+    res.append(name + "(" + majorCode + "," + minorCode + ")");
     if (args != null) {
       if (args.length == 1) {
         res.append(": " + String.valueOf(args[0]));
@@ -239,6 +223,23 @@
       }
     }
     return true;
+  }
+  
+  private static class EmptyParseImpl implements Parse {
+    
+    private ParseData data = null;
+    
+    public EmptyParseImpl(ParseStatus status) {
+      data = new ParseData(status, "", new Outlink[0], new Properties());
+    }
+    
+    public ParseData getData() {
+      return data;
+    }
+
+    public String getText() {
+      return "";
+    }
   }
 }
 

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolStatus.java Thu Jul  7 15:18:08 2005
@@ -19,6 +19,7 @@
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
+import java.util.HashMap;
 
 import org.apache.nutch.io.VersionedWritable;
 import org.apache.nutch.io.WritableUtils;
@@ -77,6 +78,24 @@
   private long lastModified;
   private String[] args;
   
+  private static HashMap codeToName = new HashMap();
+  static {
+    codeToName.put(new Integer(SUCCESS), "success");
+    codeToName.put(new Integer(FAILED), "failed");
+    codeToName.put(new Integer(PROTO_NOT_FOUND), "proto_not_found");
+    codeToName.put(new Integer(GONE), "gone");
+    codeToName.put(new Integer(MOVED), "moved");
+    codeToName.put(new Integer(TEMP_MOVED), "temp_moved");
+    codeToName.put(new Integer(NOTFOUND), "notfound");
+    codeToName.put(new Integer(RETRY), "retry");
+    codeToName.put(new Integer(EXCEPTION), "exception");
+    codeToName.put(new Integer(ACCESS_DENIED), "access_denied");
+    codeToName.put(new Integer(ROBOTS_DENIED), "robots_denied");
+    codeToName.put(new Integer(REDIR_EXCEEDED), "redir_exceeded");
+    codeToName.put(new Integer(NOTFETCHING), "notfetching");
+    codeToName.put(new Integer(NOTMODIFIED), "notmodified");
+  }
+  
   public ProtocolStatus() {
     
   }
@@ -215,7 +234,7 @@
   
   public String toString() {
     StringBuffer res = new StringBuffer();
-    res.append("(" + code + "), lastModified=" + lastModified);
+    res.append(codeToName.get(new Integer(code)) + "(" + code + "), lastModified=" + lastModified);
     if (args != null) {
       if (args.length == 1) {
         res.append(": " + String.valueOf(args[0]));

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java Thu Jul  7 15:18:08 2005
@@ -528,6 +528,7 @@
         cnt++;
         if (dump) reader.dump(sorted, System.out);
       } catch (Throwable t) {
+        t.printStackTrace();
         LOG.warning(t.getMessage());
       }
     }

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java Thu Jul  7 15:18:08 2005
@@ -183,31 +183,22 @@
             continue;
           }
 
-          // if fetch was successful or
-          // previously unable to parse (so try again)
-          ProtocolStatus ps = fetcherOutput.getProtocolStatus();
-          if (ps.isSuccess()) {
-            handleContent(url, content);
-            synchronized (ParseSegment.this) {
-              pages++;                    // record successful parse
-              bytes += content.getContent().length;
-              if ((pages % 100) == 0)
-                status();
-            }
-          } else {
-            // errored at fetch step
-            logError(url, new ProtocolException("Error at fetch stage: " + ps));
-            handleNoContent(new ParseStatus(ParseStatus.FAILED_MISSING_CONTENT));
+          handleContent(fetcherOutput, content);
+          synchronized (ParseSegment.this) {
+            pages++;                    // record successful parse
+            bytes += content.getContent().length;
+            if ((pages % 100) == 0)
+              status();
           }
 
         } catch (ParseException e) {
           logError(url, e);
-          handleNoContent(new ParseStatus(e));
+          handleError(new ParseStatus(e));
 
         } catch (Throwable t) {                   // an unchecked exception
           if (fle != null) {
             logError(url, t);
-            handleNoContent(new ParseStatus(t));
+            handleError(new ParseStatus(t));
           } else {
             LOG.severe("Unexpected exception");
           }
@@ -224,27 +215,35 @@
       }
     }
 
-    private void handleContent(String url, Content content)
+    private void handleContent(FetcherOutput fo, Content content)
       throws ParseException {
 
-      //String contentType = content.getContentType();
-      String contentType = content.getMetadata().getProperty("Content-Type");
+      String url = fo.getUrl().toString();
+      if (content != null) {
+        String contentType = content.getMetadata().getProperty("Content-Type");
+        if (ParseSegment.this.dryRun) {
+          LOG.info("To be handled as Content-Type: "+contentType);
+          return;
+        }
 
-      if (ParseSegment.this.dryRun) {
-        LOG.info("To be handled as Content-Type: "+contentType);
-        return;
+        Parser parser = ParserFactory.getParser(contentType, url);
+        Parse parse = parser.getParse(content);
+        outputPage(new ParseText(parse.getText()), parse.getData());
+        
+      } else {
+        if (ParseSegment.this.dryRun) {
+          LOG.info("To be handled as no content");
+          return;
+        }
+        outputPage(new ParseText(""),
+                new ParseData(new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_MISSING_CONTENT),
+                        "", new Outlink[0], new Properties()));
       }
-
-      Parser parser = ParserFactory.getParser(contentType, url);
-      Parse parse = parser.getParse(content);
-
-      outputPage
-        (new ParseText(parse.getText()), parse.getData());
     }
 
-    private void handleNoContent(ParseStatus status) {
+    private void handleError(ParseStatus status) {
       if (ParseSegment.this.dryRun) {
-        LOG.info("To be handled as no content");
+        LOG.info("To be handled as error");
         return;
       }
       outputPage(new ParseText(""),
@@ -267,6 +266,7 @@
               +" wait="+(t4-t3) +" write="+(t5-t4) +"ms");
         }
       } catch (Throwable t) {
+        t.printStackTrace();
         LOG.severe("error writing output:" + t.toString());
       }
     }

Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/build.xml?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/build.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/build.xml Thu Jul  7 15:18:08 2005
@@ -9,6 +9,10 @@
     <copy todir="${build.classes}">
       <fileset dir="${src.dir}" includes="**/*.ngp, **/*.properties"/>
     </copy>
+    <echo>Copying test files</echo>
+    <copy todir="${build.test}">
+      <fileset dir="${src.test}" includes="**/*.test, **/*.txt"/>
+    </copy>
   </target>
 	
 </project>

Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml Thu Jul  7 15:18:08 2005
@@ -34,7 +34,7 @@
               name="Nutch language identifier filter"
               point="org.apache.nutch.indexer.IndexingFilter">
       <implementation id="LanguageIdentifier"
-                      class="org.apache.nutch.analysis.lang.LanguageIdentifier"/>
+                      class="org.apache.nutch.analysis.lang.LanguageIndexingFilter"/>
    </extension>
 
 

Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Thu Jul  7 15:18:08 2005
@@ -15,83 +15,153 @@
  */
 package org.apache.nutch.analysis.lang;
 
-import java.io.BufferedReader;
+// JDK imports
 import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
 import java.io.InputStream;
+import java.io.IOException;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
 import java.io.InputStreamReader;
-import java.util.Iterator;
+import java.util.List;
 import java.util.Vector;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.ArrayList;
+import java.util.Properties;
+import java.util.Enumeration;
 import java.util.logging.Logger;
 
-import org.apache.nutch.fetcher.FetcherOutput;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
+// Nutch imports
+import org.apache.nutch.analysis.lang.NGramProfile.NGramEntry;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.ParserNotFound;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolNotFound;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConf;
 import org.apache.nutch.util.LogFormatter;
 
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import java.util.Properties;
-import java.util.Enumeration;
 
 /**
  * 
  * @author Sami Siren
- *  
+ * @author Jerome Charron
  */
-public class LanguageIdentifier implements IndexingFilter {
-  public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.analysis.lang.LanguageIdentifier");
+public class LanguageIdentifier {
+  
+ 
+  private final static int DEFAULT_ANALYSIS_LENGTH = 0;    // 0 means full content
+  
+  private final static float SCORE_THRESOLD = 0.00F;
+
+  public final static Logger LOG = LogFormatter.getLogger(LanguageIdentifier.class.getName());
+
+  
+  private ArrayList languages = new ArrayList();
+
+  private ArrayList supportedLanguages = new ArrayList();
+
+  /** Minimum size of NGrams */
+  private int minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
+  
+  /** Maximum size of NGrams */
+  private int maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
+  
+  /** The maximum amount of data to analyze */
+  private int analyzeLength = DEFAULT_ANALYSIS_LENGTH;
+  
+  /** A global index of ngrams of all supported languages */
+  private HashMap ngramsIdx = new HashMap();
 
-  private Vector languages = new Vector();
+  /** The NGramProfile used for identification */
+  private NGramProfile suspect = null;
 
-  private Vector supportedLanguages = new Vector();
+  /** My singleton instance */
+  private static LanguageIdentifier identifier = null;
 
-  private static LanguageIdentifier identifier = new LanguageIdentifier(true);
 
-  private static float SCORE_THRESOLD = 0.00F;
-
-  //public constructor needed for extension mechanism
-  public LanguageIdentifier() {}
+  /**
+   * Constructs a new Language Identifier.
+   */
+  private LanguageIdentifier() {
 
-  private LanguageIdentifier(boolean fake) {
+    // Gets ngram sizes to take into account from the Nutch Config
+    minLength = NutchConf.get().getInt("lang.ngram.min.length",
+                                       NGramProfile.DEFAULT_MIN_NGRAM_LENGTH);
+    maxLength = NutchConf.get().getInt("lang.ngram.max.length",
+                                       NGramProfile.DEFAULT_MAX_NGRAM_LENGTH);
+    // Ensure the min and max values are in an acceptable range
+    // (i.e. min >= ABSOLUTE_MIN_NGRAM_LENGTH and max <= ABSOLUTE_MAX_NGRAM_LENGTH)
+    maxLength = Math.min(maxLength, NGramProfile.ABSOLUTE_MAX_NGRAM_LENGTH);
+    maxLength = Math.max(maxLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
+    minLength = Math.max(minLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
+    minLength = Math.min(minLength, maxLength);
+
+    // Gets the value of the maximum size of data to analyze
+    analyzeLength = NutchConf.get().getInt("lang.analyze.max.length",
+                                           DEFAULT_ANALYSIS_LENGTH);
+    
     Properties p = new Properties();
     try {
       p.load(this.getClass().getResourceAsStream("langmappings.properties"));
 
       Enumeration alllanguages = p.keys();
+      
+      LOG.info(new StringBuffer()
+                .append("Language identifier configuration [")
+                .append(minLength).append("-").append(maxLength)
+                .append("/").append(analyzeLength).append("]").toString());
 
       StringBuffer list = new StringBuffer("Language identifier plugin supports:");
+      HashMap tmpIdx = new HashMap();
       while (alllanguages.hasMoreElements()) {
         String lang = (String) (alllanguages.nextElement());
 
         InputStream is = this.getClass().getClassLoader().getResourceAsStream(
-                "org/apache/nutch/analysis/lang/" + lang + "." + NGramProfile.NGRAM_FILE_EXTENSION);
+                "org/apache/nutch/analysis/lang/" + lang + "." + NGramProfile.FILE_EXTENSION);
 
         if (is != null) {
-          NGramProfile profile = new NGramProfile(lang);
+          NGramProfile profile = new NGramProfile(lang, minLength, maxLength);
           try {
             profile.load(is);
             languages.add(profile);
             supportedLanguages.add(lang);
-            list.append(" " + lang);
+            List ngrams = profile.getSorted();
+            for (int i=0; i<ngrams.size(); i++) {
+                NGramEntry entry = (NGramEntry) ngrams.get(i);
+                List registered = (List) tmpIdx.get(entry);
+                if (registered == null) {
+                    registered = new ArrayList();
+                    tmpIdx.put(entry, registered);
+                }
+                registered.add(entry);
+                entry.setProfile(profile);
+            }
+            list.append(" " + lang + "(" + ngrams.size() + ")");
             is.close();
           } catch (IOException e1) {
             LOG.severe(e1.toString());
           }
         }
       }
+      // transform all ngram lists to arrays for performance
+      Iterator keys = tmpIdx.keySet().iterator();
+      while (keys.hasNext()) {
+        NGramEntry entry = (NGramEntry) keys.next();
+        List l = (List) tmpIdx.get(entry);
+        if (l != null) {
+          NGramEntry[] array = (NGramEntry[]) l.toArray(new NGramEntry[l.size()]);
+          ngramsIdx.put(entry.getSeq(), array);
+        }
+      }
       LOG.info(list.toString());
+      // Create the suspect profile
+      suspect = new NGramProfile("suspect", minLength, maxLength);
     } catch (Exception e) {
       LOG.severe(e.toString());
     }
@@ -101,6 +171,13 @@
    * return handle to singleton instance
    */
   public static LanguageIdentifier getInstance() {
+    if (identifier == null) {
+        synchronized(LanguageIdentifier.class) {
+            if (identifier == null) {
+                identifier = new LanguageIdentifier();
+            }
+        }
+    }
     return identifier;
   }
 
@@ -157,15 +234,24 @@
       if (args[i].equals("-identifyfileset")) {
         command = IDFILESET;
         for (i++; i < args.length; i++) {
-          fileset.add(args[i]);
-          System.out.println(args[i]);
+          File[] files = null;
+          File f = new File(args[i]);
+          if (f.isDirectory()) {
+              files = f.listFiles();
+          } else {
+              files = new File[] { f };
+          }
+          for (int j=0; j<files.length; j++) {
+            fileset.add(files[j].getAbsolutePath());
+          }
         }
       }
 
     }
 
     String lang = null;
-    LanguageIdentifier idfr = LanguageIdentifier.getInstance();
+    //LanguageIdentifier idfr = LanguageIdentifier.getInstance();
+    LanguageIdentifier idfr = new LanguageIdentifier();
     File f;
     FileInputStream fis;
     try {
@@ -205,9 +291,12 @@
           break;
 
         case IDFILESET:
+          /* used for benchmarks
+          for (int j=128; j<=524288; j*=2) {
+            long start = System.currentTimeMillis();
+            idfr.analyzeLength = j; */
           System.out.println("FILESET");
           Iterator i = fileset.iterator();
-
           while (i.hasNext()) {
             try {
               filename = (String) i.next();
@@ -218,12 +307,13 @@
             } catch (Exception e) {
               System.out.println(e);
             }
-
             System.out.println(filename + " was identified as " + lang);
           }
+          /* used for benchmarks
+            System.out.println(j + "/" + (System.currentTimeMillis()-start));
+          } */
           System.exit(0);
           break;
-
       }
     } catch (Exception e) {
       System.out.println(e);
@@ -261,46 +351,57 @@
   /**
    * Identify language based on submitted content
    * 
-   * @param text text of doc
+   * @param text to analyze
    * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
    *         unknown
    */
   public String identify(String text) {
-
     return identify(new StringBuffer(text));
   }
 
-  public String identify(StringBuffer text) {
+  /**
+   * Identify language based on submitted content
+   * 
+   * @param content the text to analyze
+   * @return 2 letter ISO639 code of language (en, fi, sv...) , or an empty
+   *         string if no language profile matched
+   */
+  public String identify(StringBuffer content) {
 
-    NGramProfile p = new NGramProfile("suspect");
-    p.analyze(text);
+    StringBuffer text = content;
+    if ((analyzeLength > 0) && (content.length() > analyzeLength)) {
+        text = new StringBuffer().append(content);
+        text.setLength(analyzeLength);
+    }
 
-    float topscore = Float.MAX_VALUE;
+    suspect.analyze(text);
+    Iterator iter = suspect.getSorted().iterator();
+    float topscore = Float.MIN_VALUE;
     String lang = "";
-
-    Iterator i = languages.iterator();
-    while (i.hasNext()) {
-
-      NGramProfile profile = (NGramProfile) i.next();
-      float score = profile.getSimilarity(p);
-
-      //LOG.fine(profile.getName() + ":" + score);
-
-      if (score < topscore) {
-        topscore = score;
-        lang = profile.getName();
-      }
+    HashMap scores = new HashMap();
+    NGramEntry searched = null;
+    
+    while (iter.hasNext()) {
+        searched = (NGramEntry) iter.next();
+        NGramEntry[] ngrams = (NGramEntry[]) ngramsIdx.get(searched.getSeq());
+        if (ngrams != null) {
+            for (int j=0; j<ngrams.length; j++) {
+                NGramProfile profile = ngrams[j].getProfile();
+                Float pScore = (Float) scores.get(profile);
+                if (pScore == null) {
+                    pScore = new Float(0);
+                }
+                float plScore = pScore.floatValue();
+                plScore += ngrams[j].getFrequency() + searched.getFrequency();
+                scores.put(profile, new Float(plScore));
+                if (plScore > topscore) {
+                    topscore = plScore;
+                    lang = profile.getName();
+                }
+            }
+        }
     }
-
-    p.ngrams.clear();
-    p = null;
-
-    LOG.finest("TOPSCORE: " + lang + " with " + topscore);
-
-    if (topscore > SCORE_THRESOLD)
-      return lang;
-
-    else return null;
+    return lang;
   }
 
   /**
@@ -313,42 +414,17 @@
   public String identify(InputStream is) throws IOException {
 
     StringBuffer text = new StringBuffer();
-    byte buffer[] = new byte[2000];
+    byte[] buffer = new byte[2048];
     int len = 0;
 
-    while ((len = is.read(buffer)) != -1) {
+    while (((len = is.read(buffer)) != -1) &&
+           ((analyzeLength == 0) || (text.length() < analyzeLength))) {
+      if (analyzeLength != 0) {
+          len = Math.min(len, analyzeLength - text.length());
+      }
       text.append(new String(buffer, 0, len));
     }
-
-    return identify(text.toString());
-  }
-
-  public Document filter(Document doc, Parse parse, FetcherOutput fo) throws IndexingException {
-
-    //check if X-meta-lang found, possibly put there by HTMLLanguageParser
-    String lang = parse.getData().get(HTMLLanguageParser.META_LANG_NAME);
-
-    //check if HTTP-header tels us the language
-    if (lang == null) lang = parse.getData().get("Content-Language");
-
-    if (lang == null) {
-      StringBuffer text = new StringBuffer();
-      /*
-       * String[] anchors = fo.getAnchors(); for (int i = 0; i < anchors.length;
-       * i++) { text+=anchors[i] + " "; }
-       */
-      text.append(parse.getData().getTitle()).append(" ");
-      text.append(parse.getText());
-      lang = LanguageIdentifier.getInstance().identify(text);
-    }
-
-    if (lang == null) {
-      lang = "unknown";
-    }
-
-    doc.add(Field.Keyword("lang", lang));
-
-    return doc;
+    return identify(text);
   }
 
 }

Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java Thu Jul  7 15:18:08 2005
@@ -13,29 +13,34 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.analysis.lang;
 
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
+// JDK imports
 import java.io.File;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.io.OutputStream;
+import java.io.BufferedInputStream;
 import java.util.Date;
-import java.util.Collections;
-import java.util.Hashtable;
+import java.util.List;
 import java.util.Iterator;
-import java.util.Vector;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
 import java.util.logging.Logger;
 
+// Nutch imports
 import org.apache.nutch.util.LogFormatter;
 
+// Lucene imports
 import org.apache.lucene.analysis.Token;
 
+
 /**
  * This class runs a ngram analysis over submitted text, results might be used
  * for automatic language identifiaction.
@@ -45,257 +50,235 @@
  * Methods are provided to build new NGramProfiles profiles.
  * 
  * @author Sami Siren
+ * @author Jerome Charron - http://frutch.free.fr/
  */
 public class NGramProfile {
 
   public static final Logger LOG = LogFormatter
       .getLogger("org.apache.nutch.analysis.lang.NGramProfile");
 
-  private String name;
+  /** The minimum length allowed for a ngram. */
+  final static int ABSOLUTE_MIN_NGRAM_LENGTH = 1;
 
-  private Vector sorted = null;
+  /** The maximum length allowed for a ngram. */
+  final static int ABSOLUTE_MAX_NGRAM_LENGTH = 4;
+    
+  /** The default min length of ngram */
+  final static int DEFAULT_MIN_NGRAM_LENGTH = 3;
 
-  private StringBuffer tokensb = new StringBuffer();
+  /** The default max length of ngram */
+  final static int DEFAULT_MAX_NGRAM_LENGTH = 3;
 
-  private int min_ngram_length = DEFAULT_MIN_NGRAM_LENGTH;
+  /** The ngram profile file extension */
+  static final String FILE_EXTENSION = "ngp";
 
-  private int max_ngram_length = DEFAULT_MAX_NGRAM_LENGTH;
+  /** The profile max size (number of ngrams of the same size) */
+  static final int MAX_SIZE = 1000;
 
-  private int ngramcount = 0;
+  /** separator char */
+  static final char SEPARATOR = '_';
+  /** The String form of the separator char */  
+  private final static String SEP_CHARSEQ = new String(new char[] { SEPARATOR });
 
-  static final String NGRAM_FILE_EXTENSION = "ngp";
+  
+  /** The profile's name */
+  private String name = null;
 
-  static final int NGRAM_LENGTH = 1000;
+  /** The NGrams of this profile sorted on the number of occurrences */
+  private List sorted = null;
 
-  //separator char
-  static final char SEPARATOR = '_';
+  /** The min length of ngram */
+  private int minLength = DEFAULT_MIN_NGRAM_LENGTH;
 
-  //default min length of ngram
-  static final int DEFAULT_MIN_NGRAM_LENGTH = 1;
+  /** The max length of ngram */
+  private int maxLength = DEFAULT_MAX_NGRAM_LENGTH;
 
-  //default max length of ngram
-  static final int DEFAULT_MAX_NGRAM_LENGTH = 4;
+  /** The total number of ngram occurrences, indexed by ngram length */
+  private int[] ngramcounts = null;
 
-  //table to store ngrams
-  Hashtable ngrams = null;
+  /** An index of the ngrams of the profile */
+  private Map ngrams = null;
 
+  /** A StringBuffer used during analysis */
+  private QuickStringBuffer word = new QuickStringBuffer();
+  
+    
   /**
-   * private class used to store NGramEntry
+   * Construct a new ngram profile
+   * 
+   * @param name is the name of the profile
+   * @param minlen is the min length of ngram sequences
+   * @param maxlen is the max length of ngram sequences
    */
-  class NGramEntry implements Comparable {
-    private CharSequence seq;
-
-    private int count;
-
-    private float normalized_count;
-
-    public NGramEntry(CharSequence seq) {
-      this.seq = seq;
-    }
-
-    /**
-     * @param ngramsequence
-     * @param ngramcount
-     */
-    public NGramEntry(String ngramsequence, int ngramcount) {
-      seq = new StringBuffer(ngramsequence).subSequence(0, ngramsequence
-          .length());
-      this.count = ngramcount;
-    }
-
-    public int getCount() {
-      return count;
-    }
-
-    public CharSequence getSeq() {
-      return seq;
-    }
-
-    public int compareTo(Object o) {
-      if (((NGramEntry) o).count - count != 0)
-        return ((NGramEntry) o).count - count;
-      else
-        return (seq.toString().compareTo(((NGramEntry) o).seq.toString()));
-    }
-
-    public void inc() {
-      count++;
-    }
+  public NGramProfile(String name, int minlen, int maxlen) {
+    // TODO: Compute the initial capacity using minlen and maxlen.
+    this.ngrams = new HashMap(4000);
+    this.minLength = minlen;
+    this.maxLength = maxlen;
+    this.name = name;
   }
 
   /**
-   * Construct a new ngram profile
+   * @return Returns the name.
+   */
+  public String getName() {
+    return name;
+  }
+  
+  /**
+   * Add ngrams from a token to this profile
    * 
-   * @param name
-   *          Name of profile
+   * @param t is the Token to be added
    */
-  public NGramProfile(String name) {
-    this(name, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH);
+  public void add(Token t) {
+    add(new StringBuffer().append(SEPARATOR)
+                          .append(t.termText())
+                          .append(SEPARATOR));
   }
 
   /**
-   * Construct a new ngram profile
+   * Add ngrams from a single word to this profile
    * 
-   * @param name
-   *          Name of profile
-   * @param minlen
-   *          min length of ngram sequences
-   * @param maxlen
-   *          max length of ngram sequences
+   * @param word is the word to add
    */
-  public NGramProfile(String name, int minlen, int maxlen) {
-    ngrams = new Hashtable();
-    this.max_ngram_length = maxlen;
-    this.min_ngram_length = minlen;
-    this.name = name;
+  public void add(StringBuffer word) {
+    for (int i=minLength; (i <= maxLength) && (i < word.length()); i++) {
+      add(word, i);
+    }
   }
 
   /**
-   * Add ngrams from a token to this profile
-   * 
-   * @param t
-   *          Token to be added
+   * Add the last NGrams from the specified word.
    */
-  public void addFromToken(Token t) {
-    tokensb.setLength(0);
-    tokensb.append(SEPARATOR).append(t.termText()).append(SEPARATOR);
-    addNGrams(tokensb);
+  private void add(QuickStringBuffer word) {
+    int wlen = word.length();
+    if (wlen >= minLength) {
+        int max = Math.min(maxLength, wlen);
+        for (int i=minLength; i<=max; i++) {
+            add(word.subSequence(wlen-i, wlen));
+        }
+    }
+  }
+  
+  /**
+   * Count one occurrence of a single ngram in this profile
+   * (a ngram made of the lone separator char is ignored)
+   *
+   * @param cs is the ngram sequence to count
+   */
+  private void add(CharSequence cs) {
+
+    if (cs.equals(SEP_CHARSEQ)) { return; }
+    NGramEntry nge = (NGramEntry) ngrams.get(cs);
+    if (nge == null) {
+      nge = new NGramEntry(cs);
+      ngrams.put(cs, nge);
+    }
+    nge.inc();
   }
 
   /**
    * Analyze a piece of text
    * 
-   * @param text
-   *          the text to be analyzed
+   * @param text the text to be analyzed
    */
   public void analyze(StringBuffer text) {
-    StringBuffer word;
-    int i;
 
     if (ngrams != null) {
       ngrams.clear();
+      sorted = null;
     }
 
-    word = new StringBuffer().append(SEPARATOR);
-    for (i = 0; i < text.length(); i++) {
+    word.clear().append(SEPARATOR);
+    for (int i = 0; i < text.length(); i++) {
       char c = Character.toLowerCase(text.charAt(i));
 
       if (Character.isLetter(c)) {
-        word.append(c);
+        add(word.append(c));
       } else {
         //found word boundary
         if (word.length() > 1) {
           //we have a word!
-          word.append(SEPARATOR);
-          addNGrams(word);
-          word.delete(0, word.length());
+          add(word.append(SEPARATOR));
+          word.clear().append(SEPARATOR);
         }
       }
     }
 
     if (word.length() > 1) {
-      //we have a last word
-      word.append(SEPARATOR);
-      addNGrams(word);
+      //we have a word!
+      add(word.append(SEPARATOR));
     }
     normalize();
   }
 
   /**
-   * Normalize profile
-   */
-  protected void normalize() {
-    Vector sorted = getSorted();
-    int sum = 0;
-
-    //only calculate ngramcount if it was not available in profile
-    if (ngramcount == 0) {
-      for (int i = 0; i < sorted.size(); i++) {
-        ngramcount += ((NGramEntry) sorted.get(i)).count;
-      }
-    }
-
-    if (sorted.size() > 0) {
-      Iterator i = sorted.iterator();
-
-      while (i.hasNext()) {
-        NGramEntry e = (NGramEntry) i.next();
-        e.normalized_count = e.count / (float)ngramcount;
-      }
-    }
-  }
-
-  /**
-   * Add ngrams from a single word to this profile
-   * 
    * @param word
+   * @param n sequence length
    */
-  public void addNGrams(StringBuffer word) {
-    int i;
-
-    for (i = min_ngram_length; i <= max_ngram_length && i < word.length(); i++) {
-      addNGrams(word, i);
+  private void add(StringBuffer word, int n) {
+    for (int i=0; i <= word.length()-n; i++) {
+      add(word.subSequence(i, i + n));
     }
   }
-
+    
   /**
-   * @param word
-   * @param n
-   *          sequence length
+   * Normalize the profile (calculates the ngrams frequencies)
    */
-  private void addNGrams(StringBuffer word, int n) {
-    NGramEntry nge;
-    StringBuffer sb;
-    int i;
-
-    for (i = 0; i <= word.length() - n; i++) {
-
-      CharSequence cs = word.subSequence(i, i + n);
+  protected void normalize() {
 
-      if (ngrams.containsKey(cs)) {
-        nge = (NGramEntry) ngrams.get(cs);
-      } else {
-        nge = new NGramEntry(cs);
+    NGramEntry e = null;
+    //List sorted = getSorted();
+    Iterator i = ngrams.values().iterator();
+
+    // Calculate ngramcount if not already done
+    if (ngramcounts == null) {
+      ngramcounts = new int[maxLength+1];
+      while (i.hasNext()) {
+        e = (NGramEntry) i.next();
+        ngramcounts[e.size()] += e.count;
       }
-      nge.inc();
-      ngrams.put(cs, nge);
+    }
+    
+    i = ngrams.values().iterator();
+    while (i.hasNext()) {
+      e = (NGramEntry) i.next();
+      e.frequency = (float) e.count / (float) ngramcounts[e.size()];
     }
   }
 
   /**
-   * Return sorted vector of ngrams (sort done by 1. count 2. sequence)
+   * Return a sorted list of ngrams (sort done by 1. frequency 2. sequence)
    * 
    * @return sorted vector of ngrams
    */
-  public Vector getSorted() {
-    //make sure srting is done only once
+  public List getSorted() {
+    // make sure sorting is done only once
     if (sorted == null) {
-      sorted = new Vector(ngrams.values());
+      sorted = new ArrayList(ngrams.values());
       Collections.sort(sorted);
 
-      //trim at NGRAM_LENGTH entries
-      if (sorted.size() > NGRAM_LENGTH)
-        sorted.setSize(NGRAM_LENGTH);
+      // trim at MAX_SIZE entries
+      if (sorted.size() > MAX_SIZE) {
+        sorted = sorted.subList(0, MAX_SIZE);
+      }
     }
-
     return sorted;
   }
-
-  /**
-   * Return ngramprofile as text
-   * 
-   * @return ngramprofile as text
-   */
+  
+  // Inherited JavaDoc
   public String toString() {
-    StringBuffer s = new StringBuffer();
+
+    StringBuffer s = new StringBuffer().append("NGramProfile: ")
+                                       .append(name).append("\n");
 
     Iterator i = getSorted().iterator();
 
-    s.append("NGramProfile: ").append(name).append("\n");
     while (i.hasNext()) {
       NGramEntry entry = (NGramEntry) i.next();
-      s.append(entry.count).append(':').append(entry.seq).append(" ").append(
-          entry.normalized_count).append("\n");
+      s.append("[").append(entry.seq)
+       .append("/").append(entry.count)
+       .append("/").append(entry.frequency).append("]\n");
     }
     return s.toString();
   }
@@ -308,6 +291,7 @@
    * @return similarity 0=exact match
    */
   public float getSimilarity(NGramProfile another) {
+      
     float sum = 0;
 
     try {
@@ -315,21 +299,20 @@
       while (i.hasNext()) {
         NGramEntry other = (NGramEntry) i.next();
         if (ngrams.containsKey(other.seq)) {
-          sum += Math.abs((other.normalized_count - ((NGramEntry) ngrams
-              .get(other.seq)).normalized_count)) / 2;
+          sum += Math.abs((other.frequency -
+                          ((NGramEntry) ngrams.get(other.seq)).frequency)) / 2;
         } else {
-          sum += other.normalized_count;
+          sum += other.frequency;
         }
       }
       i = getSorted().iterator();
       while (i.hasNext()) {
         NGramEntry other = (NGramEntry) i.next();
         if (another.ngrams.containsKey(other.seq)) {
-          sum += Math
-              .abs((other.normalized_count - ((NGramEntry) another.ngrams
-                  .get(other.seq)).normalized_count)) / 2;
+          sum += Math.abs((other.frequency -
+                          ((NGramEntry) another.ngrams.get(other.seq)).frequency)) / 2;
         } else {
-          sum += other.normalized_count;
+          sum += other.frequency;
         }
       }
     } catch (Exception e) {
@@ -339,27 +322,29 @@
   }
 
   /**
-   * Loads a ngram profile from InputStream (assumes UTF-8 encoded content)
+   * Loads a ngram profile from an InputStream
+   * (assumes UTF-8 encoded content)
+   * @param is the InputStream to read
    */
   public void load(InputStream is) throws IOException {
-    BufferedReader bis = new BufferedReader(new InputStreamReader(is, "UTF-8"));
-    String line;
 
     ngrams.clear();
+    ngramcounts = new int[maxLength+1];
+    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+    String line = null;
 
-    while ((line = bis.readLine()) != null) {
+    while ((line = reader.readLine()) != null) {
 
       // # starts a comment line
       if (line.charAt(0) != '#') {
         int spacepos = line.indexOf(' ');
         String ngramsequence = line.substring(0, spacepos).trim();
-        int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
-
-        if (!line.startsWith("ngram_count")) {
-          NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
-          ngrams.put(en.getSeq(), en);
-        } else {
-          this.ngramcount = ngramcount;
+        int len = ngramsequence.length();
+        if ((len >= minLength) && (len <= maxLength)) {
+            int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
+            NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
+            ngrams.put(en.getSeq(), en);
+            ngramcounts[len] += ngramcount;
         }
       }
     }
@@ -369,16 +354,14 @@
   /**
    * Create a new Language profile from (preferably quite large) text file
    * 
-   * @param name
-   *          name of profile
-   * @param is
-   * @param encoding
-   *          encoding of stream
+   * @param name is the name of the profile
+   * @param is is the stream to read
+   * @param encoding is the encoding of stream
    */
-  public static NGramProfile createNgramProfile(String name, InputStream is,
-      String encoding) {
+  public static NGramProfile create(String name, InputStream is, String encoding) {
 
-    NGramProfile newProfile = new NGramProfile(name);
+    NGramProfile newProfile = new NGramProfile(name, ABSOLUTE_MIN_NGRAM_LENGTH,
+                                                     ABSOLUTE_MAX_NGRAM_LENGTH);
     BufferedInputStream bis = new BufferedInputStream(is);
 
     byte buffer[] = new byte[4096];
@@ -394,7 +377,6 @@
     }
 
     newProfile.analyze(text);
-
     return newProfile;
   }
 
@@ -402,25 +384,42 @@
    * Writes NGramProfile content into OutputStream, content is outputted with
    * UTF-8 encoding
    * 
-   * @param os
-   *          Stream to output to
+   * @param os the Stream to output to
    * @throws IOException
    */
-
   public void save(OutputStream os) throws IOException {
-    Vector v = getSorted();
-    Iterator i = v.iterator();
-    os
-        .write(("# NgramProfile generated at " + new Date() + " for Nutch Language Identification\n")
-            .getBytes());
-    os.write(("ngram_count " + ngramcount + "\n").getBytes());
 
-    while (i.hasNext()) {
-      NGramEntry e = (NGramEntry) i.next();
-      String line = e.getSeq().toString() + " " + e.getCount() + "\n";
+    // Write header
+    os.write(("# NgramProfile generated at " + new Date() +
+              " for Nutch Language Identification\n").getBytes());
+
+    // And then each ngram
+    
+    // First dispatch ngrams in many lists depending on their size
+    // (one list for each size, in order to store MAX_SIZE ngrams for each
+    // size of ngram)
+    int count = 0;
+    List list = new ArrayList();
+    List sublist = new ArrayList();
+    NGramEntry[] entries = (NGramEntry[]) ngrams.values().toArray(new NGramEntry[ngrams.size()]);
+    for (int i=minLength; i<=maxLength; i++) {
+      for (int j=0; j<entries.length; j++) {
+        if (entries[j].getSeq().length() == i) {
+          sublist.add(entries[j]);
+        }
+      }
+      Collections.sort(sublist);
+      if (sublist.size() > MAX_SIZE) {
+        sublist = sublist.subList(0, MAX_SIZE);
+      }
+      list.addAll(sublist);
+      sublist.clear();
+    }
+    for (int i=0; i<list.size(); i++) {
+      NGramEntry e = (NGramEntry) list.get(i);
+      String line = e.toString() + " " + e.getCount() + "\n";
       os.write(line.getBytes("UTF-8"));
     }
-
     os.flush();
   }
 
@@ -431,7 +430,10 @@
    */
   public static void main(String args[]) {
 
-    String usage = "Usage: NGramProfile [-create profilename filename encoding] [-similarity file1 file2] [-score profile-name filename encoding]";
+    String usage = "Usage: NGramProfile " +
+                   "[-create profilename filename encoding] " +
+                   "[-similarity file1 file2] "+
+                   "[-score profile-name filename encoding]";
     int command = 0;
 
     final int CREATE = 1;
@@ -442,7 +444,7 @@
     String filename = "";
     String filename2 = "";
     String encoding = "";
-
+    
     if (args.length == 0) {
       System.err.println(usage);
       System.exit(-1);
@@ -479,43 +481,40 @@
 
         File f = new File(filename);
         FileInputStream fis = new FileInputStream(f);
-        NGramProfile newProfile = NGramProfile.createNgramProfile(profilename,
-            fis, encoding);
+        NGramProfile newProfile = NGramProfile.create(profilename, fis, encoding);
         fis.close();
-        f = new File(profilename + "." + NGRAM_FILE_EXTENSION);
+        f = new File(profilename + "." + FILE_EXTENSION);
         FileOutputStream fos = new FileOutputStream(f);
         newProfile.save(fos);
-        System.out.println("new profile " + profilename + "."
-            + NGRAM_FILE_EXTENSION + " was created.");
+        System.out.println("new profile " + profilename + "." + FILE_EXTENSION + " was created.");
         break;
 
       case SIMILARITY:
 
         f = new File(filename);
         fis = new FileInputStream(f);
-        newProfile = NGramProfile.createNgramProfile(filename, fis, encoding);
+        newProfile = NGramProfile.create(filename, fis, encoding);
         newProfile.normalize();
 
         f = new File(filename2);
         fis = new FileInputStream(f);
-        NGramProfile newProfile2 = NGramProfile.createNgramProfile(filename2,
-            fis, encoding);
+        NGramProfile newProfile2 = NGramProfile.create(filename2, fis, encoding);
         newProfile2.normalize();
-        System.out.println("Similarity is "
-            + newProfile.getSimilarity(newProfile2));
+        System.out.println("Similarity is " + newProfile.getSimilarity(newProfile2));
         break;
 
       case SCORE:
         f = new File(filename);
         fis = new FileInputStream(f);
-        newProfile = NGramProfile.createNgramProfile(filename, fis, encoding);
+        newProfile = NGramProfile.create(filename, fis, encoding);
 
-        f = new File(profilename + "." + NGRAM_FILE_EXTENSION);
+        f = new File(profilename + "." + FILE_EXTENSION);
         fis = new FileInputStream(f);
-        NGramProfile compare = new NGramProfile(profilename);
+        NGramProfile compare = new NGramProfile(profilename,
+                                                DEFAULT_MIN_NGRAM_LENGTH,
+                                                DEFAULT_MAX_NGRAM_LENGTH);
         compare.load(fis);
         System.out.println("Score is " + compare.getSimilarity(newProfile));
-
         break;
 
       }
@@ -525,18 +524,217 @@
     }
   }
 
+  
   /**
-   * @return Returns the name.
+   * Inner class that describes a NGram
    */
-  public String getName() {
-    return name;
+  class NGramEntry implements Comparable {
+
+    /** The NGRamProfile this NGram is related to */
+    private NGramProfile profile = null;
+
+    /** The sequence of characters of the ngram */
+    CharSequence seq = null;
+
+    /** The number of occurrences of this ngram in its profile */
+    private int count = 0;
+
+    /** The frequency of this ngram in its profile */
+    private float frequency = 0.0F;
+
+    
+    /** 
+     * Constructs a new NGramEntry
+     * @param seq is the sequence of characters of the ngram
+     */
+    public NGramEntry(CharSequence seq) {
+      this.seq = seq;
+    }
+
+    /** 
+     * Constructs a new NGramEntry
+     * @param seq is the sequence of characters of the ngram
+     * @param count is the number of occurrences of this ngram
+     */
+    public NGramEntry(String seq, int count) {
+      this.seq = new StringBuffer(seq).subSequence(0, seq.length());
+      this.count = count;
+    }
+
+    
+    /**
+     * Returns the number of occurrences of this ngram in its profile
+     * @return the number of occurrences of this ngram in its profile
+     */
+    public int getCount() {
+      return count;
+    }
+    
+    /**
+     * Returns the frequency of this ngram in its profile
+     * @return the frequency of this ngram in its profile
+     */
+    public float getFrequency() {
+        return frequency;
+    }
+
+    /**
+     * Returns the sequence of characters of this ngram
+     * @return the sequence of characters of this ngram
+     */
+    public CharSequence getSeq() {
+      return seq;
+    }
+
+    /**
+     * Returns the size of this ngram
+     * @return the size of this ngram
+     */
+    public int size() {
+        return seq.length();
+    }
+    
+    // Inherited JavaDoc
+    public int compareTo(Object o) {
+      NGramEntry ngram = (NGramEntry) o;
+      int diff = Float.compare(ngram.getFrequency(), frequency);
+      if (diff != 0) {
+        return diff;
+      } else {
+        return (toString().compareTo(ngram.toString()));
+      }
+    }
+
+    /**
+     * Increments the number of occurrences of this ngram.
+     */
+    public void inc() {
+      count++;
+    }
+
+    /**
+     * Associates a profile with this ngram
+     * @param profile is the profile associated to this ngram
+     */
+    public void setProfile(NGramProfile profile) {
+        this.profile = profile;
+    }
+
+    /**
+     * Returns the profile associated to this ngram
+     * @return the profile associated to this ngram
+     */
+    public NGramProfile getProfile() {
+        return profile;
+    }
+
+    // Inherited JavaDoc
+    public String toString() {
+        return seq.toString();
+    }
+
+    // Inherited JavaDoc
+    public int hashCode() {
+        return seq.hashCode();
+    }
+    
+    // Inherited JavaDoc
+    public boolean equals(Object obj) {
+        
+        NGramEntry ngram = null;
+        try {
+            ngram = (NGramEntry) obj;
+            return ngram.seq.equals(seq);
+        } catch (Exception e) {
+            return false;
+        }
+    }
+
   }
 
-  /**
-   * @param name
-   *          The name to set.
-   */
-  public void setName(String name) {
-    this.name = name;
+  
+  private class QuickStringBuffer implements CharSequence {
+
+    private char value[];
+
+    private int count;
+
+    QuickStringBuffer() {
+      this(16);
+    }
+
+    QuickStringBuffer(char[] value) {
+      this.value = value;
+      count = value.length;
+    }
+    
+    QuickStringBuffer(int length) {
+      value = new char[length];
+    }
+
+    QuickStringBuffer(String str) {
+      this(str.length() + 16);
+      append(str);
+    }
+
+    public int length() {
+      return count;
+    }
+
+    private void expandCapacity(int minimumCapacity) {
+      int newCapacity = (value.length + 1) * 2;
+      if (newCapacity < 0) {
+        newCapacity = Integer.MAX_VALUE;
+      } else if (minimumCapacity > newCapacity) {
+          newCapacity = minimumCapacity;
+      }
+	
+      char newValue[] = new char[newCapacity];
+      System.arraycopy(value, 0, newValue, 0, count);
+      value = newValue;
+    }
+
+    QuickStringBuffer clear() {
+      count = 0;
+      return this;
+    }
+
+    public char charAt(int index) {
+      return value[index];
+    }
+
+    QuickStringBuffer append(String str) {
+      if (str == null) {
+        str = String.valueOf(str);
+      }
+
+      int len = str.length();
+      int newcount = count + len;
+      if (newcount > value.length) {
+        expandCapacity(newcount);
+      }
+      str.getChars(0, len, value, count);
+      count = newcount;
+      return this;
+    }
+
+    QuickStringBuffer append(char c) {
+      int newcount = count + 1;
+      if (newcount > value.length) {
+        expandCapacity(newcount);
+      }
+      value[count++] = c;
+      return this;
+    }
+
+    public CharSequence subSequence(int start, int end) {
+      return new String(value, start, end - start);
+    }
+        
+    public String toString() {
+      return new String(this.value);
+    }
   }
+  
+  
 }