You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/07/08 00:18:13 UTC
svn commit: r209663 [1/12] - in /lucene/nutch/branches/mapred: conf/ site/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/protocol/
src/java/org/apache/nutch/segment/ src/j...
Author: cutting
Date: Thu Jul 7 15:18:08 2005
New Revision: 209663
URL: http://svn.apache.org/viewcvs?rev=209663&view=rev
Log:
svn merge -r 190963:209656 from trunk
Modified:
lucene/nutch/branches/mapred/conf/nutch-default.xml
lucene/nutch/branches/mapred/site/mailing_lists.html
lucene/nutch/branches/mapred/site/mailing_lists.pdf
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolStatus.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java
lucene/nutch/branches/mapred/src/plugin/languageidentifier/build.xml
lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/da.ngp
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/de.ngp
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/el.ngp
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/en.ngp
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/es.ngp
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/fi.ngp
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/fr.ngp
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/hu.ngp
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/it.ngp
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/nl.ngp
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/pl.ngp
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/pt.ngp
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/sv.ngp
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestNGramProfile.java
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml
lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
lucene/nutch/branches/mapred/src/site/src/documentation/content/xdocs/mailing_lists.xml
Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Thu Jul 7 15:18:08 2005
@@ -23,6 +23,15 @@
</property>
<property>
+ <name>http.robots.403.allow</name>
+ <value>true</value>
+ <description>Some servers return HTTP status 403 (Forbidden) if
+ /robots.txt doesn't exist. This should probably mean that we are
+ allowed to crawl the site nonetheless. If this is set to false,
+ then such sites will be treated as forbidden.</description>
+</property>
+
+<property>
<name>http.agent.description</name>
<value>Nutch</value>
<description>Further description of our bot- this text is used in
@@ -745,6 +754,40 @@
<value>1.0</value>
<description> Used as a boost for phrase in Lucene query.
Multiplied by boost for field phrase is matched in.
+ </description>
+</property>
+
+<!-- language-identifier plugin properties -->
+
+<property>
+ <name>lang.ngram.min.length</name>
+ <value>1</value>
+ <description> The minimum size of ngrams to use to identify
+ language (must be between 1 and lang.ngram.max.length).
+ The larger the range between lang.ngram.min.length and
+ lang.ngram.max.length, the better the identification, but
+ the slower it is.
+ </description>
+</property>
+
+<property>
+ <name>lang.ngram.max.length</name>
+ <value>4</value>
+ <description> The maximum size of ngrams to use to identify
+ language (must be between lang.ngram.min.length and 4).
+ The larger the range between lang.ngram.min.length and
+ lang.ngram.max.length, the better the identification, but
+ the slower it is.
+ </description>
+</property>
+
+<property>
+ <name>lang.analyze.max.length</name>
+ <value>2048</value>
+ <description> The maximum number of bytes of data to use to identify
+ the language (0 means full content analysis).
+ The larger this value, the better the analysis, but the
+ slower it is.
</description>
</property>
Modified: lucene/nutch/branches/mapred/site/mailing_lists.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/mailing_lists.html?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/mailing_lists.html (original)
+++ lucene/nutch/branches/mapred/site/mailing_lists.html Thu Jul 7 15:18:08 2005
@@ -200,20 +200,20 @@
<p>If you use Nutch, please subscribe to the Nutch user mailing list.</p>
<p>
The Nutch user mailing list is :
- <a href="mailto:nutch-user@incubator.apache.org">nutch-user@incubator.apache.org</a>.
+ <a href="mailto:nutch-user@lucene.apache.org">nutch-user@lucene.apache.org</a>.
</p>
<ul>
<li>
-<a href="mailto:nutch-user-subscribe@incubator.apache.org">Subscribe to List</a>
+<a href="mailto:nutch-user-subscribe@lucene.apache.org">Subscribe to List</a>
</li>
<li>
-<a href="mailto:nutch-user-unsubscribe@incubator.apache.org">Unsubscribe from List</a>
+<a href="mailto:nutch-user-unsubscribe@lucene.apache.org">Unsubscribe from List</a>
</li>
<li>
-<a href="http://incubator.apache.org/mail/nutch-user/">View List Archive</a>
+<a href="http://lucene.apache.org/mail/nutch-user/">View List Archive</a>
</li>
</ul>
@@ -231,20 +231,20 @@
Nutch developer mailing list.</p>
<p>
The Nutch developer mailing list is :
- <a href="mailto:nutch-dev@incubator.apache.org">nutch-dev@incubator.apache.org</a>.
+ <a href="mailto:nutch-dev@lucene.apache.org">nutch-dev@lucene.apache.org</a>.
</p>
<ul>
<li>
-<a href="mailto:nutch-dev-subscribe@incubator.apache.org">Subscribe to List</a>
+<a href="mailto:nutch-dev-subscribe@lucene.apache.org">Subscribe to List</a>
</li>
<li>
-<a href="mailto:nutch-dev-unsubscribe@incubator.apache.org">Unsubscribe from List</a>
+<a href="mailto:nutch-dev-unsubscribe@lucene.apache.org">Unsubscribe from List</a>
</li>
<li>
-<a href="http://incubator.apache.org/mail/nutch-dev/">View List Archive</a>
+<a href="http://lucene.apache.org/mail/nutch-dev/">View List Archive</a>
</li>
</ul>
@@ -263,15 +263,15 @@
<ul>
<li>
-<a href="mailto:nutch-commits-subscribe@incubator.apache.org">Subscribe to List</a>
+<a href="mailto:nutch-commits-subscribe@lucene.apache.org">Subscribe to List</a>
</li>
<li>
-<a href="mailto:nutch-commits-unsubscribe@incubator.apache.org">Unsubscribe from List</a>
+<a href="mailto:nutch-commits-unsubscribe@lucene.apache.org">Unsubscribe from List</a>
</li>
<li>
-<a href="http://incubator.apache.org/mail/nutch-commits/">View List Archive</a>
+<a href="http://lucene.apache.org/mail/nutch-commits/">View List Archive</a>
</li>
</ul>
@@ -287,20 +287,20 @@
about the Nutch crawler.</p>
<p>
The Nutch agent mailing list is :
- <a href="mailto:nutch-agent@incubator.apache.org">nutch-agent@incubator.apache.org</a>.
+ <a href="mailto:nutch-agent@lucene.apache.org">nutch-agent@lucene.apache.org</a>.
</p>
<ul>
<li>
-<a href="mailto:nutch-agent-subscribe@incubator.apache.org">Subscribe to List</a>
+<a href="mailto:nutch-agent-subscribe@lucene.apache.org">Subscribe to List</a>
</li>
<li>
-<a href="mailto:nutch-agent-unsubscribe@incubator.apache.org">Unsubscribe from List</a>
+<a href="mailto:nutch-agent-unsubscribe@lucene.apache.org">Unsubscribe from List</a>
</li>
<li>
-<a href="http://incubator.apache.org/mail/nutch-agent/">View List Archive</a>
+<a href="http://lucene.apache.org/mail/nutch-agent/">View List Archive</a>
</li>
</ul>
Modified: lucene/nutch/branches/mapred/site/mailing_lists.pdf
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/mailing_lists.pdf?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/mailing_lists.pdf (original)
+++ lucene/nutch/branches/mapred/site/mailing_lists.pdf Thu Jul 7 15:18:08 2005
@@ -69,10 +69,10 @@
>>
endobj
16 0 obj
-<< /Length 2391 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 2383 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
-Gb!l!gN)%,&:N/3n?'u0%1s_-b>XV3l*SoOg.(="h<Du...@0C5r>QK#S?Y/Je0hs)Zc1('r][6eNY[m$OSGY4395XkfsE`-mg"*"g]saTU,CP;uM?cU+m=CHKn%<h5]5d=Cj:bjTgHAQPOBkQhqCcOW:m8["(1KWI;q3\`2mP[_"Go@;OQ,!&5iB%?(:3f@iKlG?<iQPL(pubnEkC$j"3t!2KO(VRgAMTGD8j01M^&:1NC.cE8cRGYYj$"ia\n(rMef*f4AR#;.P]TU$,Ahp^Z.7,?t)ab*1U+[FBVJ;n=3Es$rUW<0J(ZsdJ-.tVWaf.!;ro<erK.qVPT$IhE"'UqJdR#Dar4AoB_4o=[0:fS%0QclW`>1SD:e7H*gA^tBX!Z&ZEcR!!60i)]\a=sV><\"Y"hDB2\'G@09OZ<)[A]A`1!%@b1oZedEXJ9J8.iC0`PR;;dBV2Ptkb]eOjc-_;QTTo,aq\4HLV-6j/JO%%V4f,`X+_N%UP0Ccj?dl"=g)>A&h[]?F7J!mqh;O2==>!%@lmeb+47=!Vj')j&b4hgCqK:42emWC`LSRJY9.f?`#:`+]%9AFHtm>BZXf8ajgo[XLYS^0At*N5dn2Qdg<apOBb"DQPo2AgB"gt\P@OT"iFk1]M9gcN9Cp[LYJB$"SXVaB+H(;PGlL!M5[0Z=,rURc!1M5Ah?V`>][.'Qdoen'3:dF[);p6OHWnT).m'P':bqEp%'j5NLH+rFg9INlK_6NNjMa=QjWQS;;g?ZML!@2"Ij.B8lRHZC=Jf_HM6MeL>(k!3+UDLaN"CDAe<)lZM\)$?/,!.n!SSI:CK7'!l\mjhbT0a<e`Or2JB,CN,2/Q;g)e@u$O[Er6^aLY5sd>[EA>Ti=X4+nb`!..Dj!/U/R/pc$E'js5"S+4UGT/TKd*o\<iIm!UtDiZ8m_H1[P,*4C//M\QK6,@jTb):!H)>40Rke$D_aul6<G0O7,KbBe67)D9GL9#R9UDWOQA:!<eVa*o>[ach:akT&Ujc@jSAI7bq:qM?%A7[-P&=L9\G->Be"G_3-cM/.UZW13(Y]g)CKcK'.S'KWH#L'==>!%0/cRsa%=(As,h;1d"*m=Z%n:e]DQbCX'+&m6BCmR]>?Fj$$p*&=D.N,`;X5(hG1DK2u0I1*1NN-`=$sr&3:r\GCmMf#n$hm@UglE$HhoOb[d/T-K6'U&3#*.RNNY:oE#":$t+aV+d?@LM?GlP1d@E3pTSGAGa`IT&bdD-'MI@R(4oO7&mRVe<H9"'^2Q%=>$*WY:@?21,#NBZ:.Ot"l'?,C2L8OOEC_kNHYLWNDjl$ldWukE-O+o6aqK02H`B=0Mjq,?(/9foeig;7j']DQ[]U1"Lo*V.>Tn\1p<<ljGD.'ZZ`XiO$)_C=[25KAiOi2+MR:[=_TpqCUY_;?H*oG=$p$6B@Y4o#W&CV5'6<0N@KZ=+An^WAOUU4"N&")P&4$R(B8#J1_iAC<B8EP(Be#<&e.C%GcDss=8F;?lm*%=$h"C)sd=?K8,mObRmHP"m-:][>3FEA/NY_$oo*<bB-?k7gKS_KOo\EqGXbAjT<7*OfaW2Q2=[KH0<@Ca,mg_G.Xkf[!`$Hli6VgA6InG!EcV[L8AtNW*/M.-)b`rL=s)+%:*[J]W02?g9q56ijZ<8@$58<F],AMOG6s?H&mY@j\\hY\f,1\<KE-p$?73eEe$lc"RTTgP.U*X!p90M/Eaii38RiqY*l&ZU8jpA^rZ;#d#J-iu=Ip,52=HnQraOREB9cJC3=/PrlS<Oc\P6bZHT:[XGbaZjN$Jk-T6()`^#Tg@X+kV&YM*b2r@=;D:r2`u$*N2bbT%hpO>4H"06f6i3^KZ1,#2@tmC$`IYZp;qX__4fis!!c6j^M?)plbfM!KH,/r<h%H*.XT'e[NTUjU][+\j'.4l]1cT+hOn\YJc2jmDYRW[b'+Qo3M$r0$q]:r1%WUp!sZgG!,&JL:]k*8XU9Rad2.Eq\R$W,Vo~>
+Gb!l!gN)%,&:N/3n?'u0%2%M]:JK97dnZBH)l531V^"j!`2Q-c,\>SWhZ\k[ncBGid\fj[YlJdi3=@CS4dOs-(<cH76[drTX51X8p'^/qn;fVe$Zi$1&aG(qlK%k_6cgkAQ3D[8k#7P2eLTfYBD!Jl":eup68t?1i^[kT%m(g0>b(KF>buc9iEU:2?gI7"0jWi6X7LbI.C4@X<>XllTo[a_.J;o3cAl&Af+Z3+Wej9FRtU)?-B_J:LTa3NV-gtQXP^,qT,0)ao@s#bK)(Xa@I++g6JTXB3F8IN%$-EAiZN95S<&S8(N3A1ZJ7N6^I5jdjBV@0(]2JI]Lh^ZX<6#_6V<5V08'b6+G_7;fu2c*WR\uR.A4:S3^'4;KT9,*&15=+"?L%D@6(8*2]'L[jGQ<VJ9WW2'8,D^76.Y1&hP938F3*(X`==m+]f4N.Mr6*4b,5<mI?/98ui<KC3e?jbNsf^9B8jR$imlj"$i@f&'f].R4;%Vr2H[l5ul<c#tTL/87C>SgO6`QB'LiO$+$cbfNhj%8eT0biki8.l;6R7.Bhk`X%V,`fp%5r/aEH`54N<""Me6m#&:i`Z8e8*G%!C,1s%NVLq3SF#*YJ?JrtUfW1gX:N<_J'e^caqY.EQ5kZV&SchIML$g)^[8XQ$XYe#?g5$!0ZdO,]NQ]%&_PN5)]YQk0?eD0!e!$tPIC8#=M`_KP;gRbKF4C]aS(I4;0S59s!Kt\upL[6`9X"8LS[]'&f]:2/n5J1;T#:6b<-+DA-[o>68asS?ji3DI..#s]CGC!d<PnE%s6cX-A[9Eu@;5[*YHb:oi\75LQINWrL'-6UV)Sn1`9cU!]gEJ[dCaQaX6!\c&mJHS%i087N'DeE$dr*2o&tdUnOh]F5%]7W?$8$9AI1fbfA$Ze#A0c4&g0jf3\1C`1(VsTCa:F>VfZ6u`S%tBoE)3peGcpqMU>4bJ't;X&"n#NQNAb]l;9tgOqV43hUfFfTT>r+!=c(gD3rQ/2IAQWjSJ0_4k(:a7)=fE;VN9uHGd&kU^)rJfKqf.?X"@Y/^C<4_7:&=skM#G)IqX)81<iHsO3^/V3Q$-sA'=F0mfU6MItIGH+;A@VRY']MQ^G"_57.PTdN6K&g>;i(e8%%a>)?+L(YKl`#tQ17\(ONnK,-"Q(.A!23HJ=V_:]KiS#lk2,3!;K-52%TKOHT%oPIDpnuN?g7)GSKC[D:=.l=G0Kd*WTAuKe<Yfs7+6XC0Hp/MmWk1nF/8BtIBA\]A/eEdVVOs,dA;231*+UDL_N"CD9e.HQVMD/$e9kEla)F>@;H*o+9WL^:`SSL_Fm7M0TnHZ7GXbCb93brd^B.g8C?rtZj$[[R&_eFR`5sc."aqu;u(9+([_BWKhGace9Oq<;JSQW?M'WIW^8tF/mS._(D(GJHf^s?Y:\P_Vn][S?T?k=!o/i61!dF1S,o:`*Lp'$G=e8hNcK',"b17+=GV$hp0EY84YUZ?^IB_q9!a=&cV4IF@F6Up<GEB7\(X@m#b7tR+6iKC2T]s0T!o6jeN5I!XqB6hg]HrV'09[AFPKtLLj]as<g_;6Ml7A60TE=j)mAbcVo0e78XEK:<[Rm0;8&.]n0!<<^c0Z2*G9!AXkJf>e8*)2$3fUOO]AfF&^4U<-e-$3W)_($-*Y+D[T4cMR/HoMI:=EO5,gl]?>U?#.CNp8pY/_A?doX+VVOQP-gdZiMC2"um(eMaj2%/)<:-hqND`'<-PR^JI,9[6no3\gVJY?VrL1PeK<diqIt&4$TRAq]B;_2dSZ1Pgj?6*o1s$d`L)?E4E%>%c&ekLs7#hNh>fSKVMPH8GFjK;WFZG)1JN3HJW*!IH$>ZT!P.L+m=2)(%_,+G=Yh6M(^q8$msn`)P,@Q,sSP#\aS-eZ)HAG/b):Xu025Q)9YjOEgKl#2'4JdKuBiB.g8E@#niM(8_Z7LXZ/J;huQLG;DSX9jpPoKdjr>nMHNQ'I!fc5I5*)-rN-F$9OU/R6jc&92oJW<gLW
&PkluAO':3qq_Xr6A18@(?hOElhcMYDK!a;Jj[i7Ds3tah29VcKL3!677(mJh8?8i3D/ao*f*4ZJ`0qK"mRipcB]K8M@:-sR8_JVE3h\E<lM3n?%T`o%IIQR#6Wt\&-hKIE2oW<1-N*T9Bam48=W)CX;"mgGD.pIbRJ,fb>1Lth,)l2CEN\[aD`0C'FdL,QA`eV&n=Y,f^r=c)'WQbApfVb2ZX0n[Jfb"VPI"W'ik]1rJ`A]e0)rKjG.GMc3,kQmJQ=j`?_1[*74_rl`NXUWQNfDL)P`;G'W@9B";gI*08@.j*hLPkJ#G6U*=F8^G25EE`+M^0i(\:Ilg`#)=$Nf[2"SF@kM%lO)M5hd7m1Oq.l:#%[!YQV6nd?1-A`AY~>
endstream
endobj
17 0 obj
@@ -103,10 +103,10 @@
19 0 obj
<< /Type /Annot
/Subtype /Link
-/Rect [ 241.668 608.466 404.004 596.466 ]
+/Rect [ 241.668 608.466 390.0 596.466 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-user@incubator.apache.org)
+/A << /URI (mailto:nutch-user@lucene.apache.org)
/S /URI >>
/H /I
>>
@@ -117,7 +117,7 @@
/Rect [ 108.0 578.066 189.336 566.066 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-user-subscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-user-subscribe@lucene.apache.org)
/S /URI >>
/H /I
>>
@@ -128,7 +128,7 @@
/Rect [ 108.0 564.866 215.988 552.866 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-user-unsubscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-user-unsubscribe@lucene.apache.org)
/S /URI >>
/H /I
>>
@@ -139,7 +139,7 @@
/Rect [ 108.0 551.666 197.316 539.666 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (http://incubator.apache.org/mail/nutch-user/)
+/A << /URI (http://lucene.apache.org/mail/nutch-user/)
/S /URI >>
/H /I
>>
@@ -147,10 +147,10 @@
23 0 obj
<< /Type /Annot
/Subtype /Link
-/Rect [ 268.992 434.522 428.664 422.522 ]
+/Rect [ 268.992 434.522 414.66 422.522 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-dev@incubator.apache.org)
+/A << /URI (mailto:nutch-dev@lucene.apache.org)
/S /URI >>
/H /I
>>
@@ -161,7 +161,7 @@
/Rect [ 108.0 404.122 189.336 392.122 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-dev-subscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-dev-subscribe@lucene.apache.org)
/S /URI >>
/H /I
>>
@@ -172,7 +172,7 @@
/Rect [ 108.0 390.922 215.988 378.922 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-dev-unsubscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-dev-unsubscribe@lucene.apache.org)
/S /URI >>
/H /I
>>
@@ -183,7 +183,7 @@
/Rect [ 108.0 377.722 197.316 365.722 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (http://incubator.apache.org/mail/nutch-dev/)
+/A << /URI (http://lucene.apache.org/mail/nutch-dev/)
/S /URI >>
/H /I
>>
@@ -205,7 +205,7 @@
/Rect [ 108.0 251.378 189.336 239.378 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-commits-subscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-commits-subscribe@lucene.apache.org)
/S /URI >>
/H /I
>>
@@ -216,7 +216,7 @@
/Rect [ 108.0 238.178 215.988 226.178 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-commits-unsubscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-commits-unsubscribe@lucene.apache.org)
/S /URI >>
/H /I
>>
@@ -227,16 +227,16 @@
/Rect [ 108.0 224.978 197.316 212.978 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (http://incubator.apache.org/mail/nutch-commits/)
+/A << /URI (http://lucene.apache.org/mail/nutch-commits/)
/S /URI >>
/H /I
>>
endobj
31 0 obj
-<< /Length 858 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 853 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
-GatU29okbt&A@ZcHqY#)[_ZA7eD7.(<4,`<TtN9GO=VPG$)Bh0eULq9^ioKFZrJa&^qBDRk2GeH=S@UR9HXDQhEi?n0L4WQ!Cn?s;$&oAK&?c\c%8;>#7El!)mc`7Ed,l=(e8;>_c)pBP;LjI@o=/QP<rV0kMS9)5$8s/l]5%EK;XdG,-DG>NsQ+0\jF7?\CcF)>d]tE5`ehb]RmQah0KL`l9#9N$]SUPP#g9Q;Y%FFh-IHtRH6+bLV#nVZTN+>or$bf8M!.cZH4k+hEou@FgJ<'[T?Q?rL/=Gkd-S,&3;QeN*,[.P-_uh6AI7[l-8@:c+Q-FGQa2<_=_ADm.+[+NCi>1<[-2'Lctt-6?q%pgi3TG=HsefbA9jf#?DE3BUYk)d4#V<N'kYUma.t+%BZ8a)ppG[k+i#8SfRCUU1j9WlFIp8'.^!T>7LCiW>0Em<9B0@6X1*qZN!IR>A!2_edJ"40sn_+"F$Pl]l==!JT[q8HCniJ0sr%[MTVQgl9k=e#_M;__qIFoWM+LRD?R:pB7kB>V#P3ib/JpO\)F9QTi]RcE?OF4A_q%/MOh-qr_0'<1%l)]`U?OmpE@1%KK(K_'4;M@BiV!a>%;bDLTh(i+2X4@6\fUcU+t#3[Aq_QqiL7Va\+I$2M;uRj71C\:mU:@D%E&WY)I$52REp_O^XaT(dZs29RWf5Pm:o.oQNltW('urcSi3W$l6%7:Ot%r\.Ku#j:U%RL)[BWa,i@lof"$r'02q+e9D]`ik*]&rP^h.7fa9!pbg:mm//e<i^9]M2M>N[M5aAn2'6OTn/qt8j)qd`?c1[?XKkLq*0s0^6DNmle:H[5N&W;]r$?ZDH><H[./HU[~>
+GatU29okbt&A@ZcHqY#)[_ZA7eD7.(<4,...@c>aui6tP6;[YgW`mJT_qb8>S]XlB)Pq>1_0(IHo?e[%IITsUgt6`mW'U$L;5Y-6_SWhJaf"4=XA[TZ[GW-ua2EUk"X3=VYdf/UL4`FX<ug31e>G%Eto0VE9-N!t'W`sC\UA*U27Y7*,^U0g:Q6oETUHE`MX@Bb!lHjX-SIuPSE+:l<&Fk__e3JQ='W4rYTR`I3pUW06o&`9GIV([#64URQ9iR]dPZd[eoM;hECD$\*A&gO56C>R(ti'@iNa6RPK?FID,=_6dgAQpKa$B;@/5mNRb0]K;Bo@NI8lDa^a^5,3C~>
endstream
endobj
32 0 obj
@@ -259,10 +259,10 @@
34 0 obj
<< /Type /Annot
/Subtype /Link
-/Rect [ 247.668 660.8 416.004 648.8 ]
+/Rect [ 247.668 660.8 402.0 648.8 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-agent@incubator.apache.org)
+/A << /URI (mailto:nutch-agent@lucene.apache.org)
/S /URI >>
/H /I
>>
@@ -273,7 +273,7 @@
/Rect [ 108.0 630.4 189.336 618.4 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-agent-subscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-agent-subscribe@lucene.apache.org)
/S /URI >>
/H /I
>>
@@ -284,7 +284,7 @@
/Rect [ 108.0 617.2 215.988 605.2 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (mailto:nutch-agent-unsubscribe@incubator.apache.org)
+/A << /URI (mailto:nutch-agent-unsubscribe@lucene.apache.org)
/S /URI >>
/H /I
>>
@@ -295,7 +295,7 @@
/Rect [ 108.0 604.0 197.316 592.0 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (http://incubator.apache.org/mail/nutch-agent/)
+/A << /URI (http://lucene.apache.org/mail/nutch-agent/)
/S /URI >>
/H /I
>>
@@ -414,53 +414,53 @@
xref
0 48
0000000000 65535 f
-0000009417 00000 n
-0000009489 00000 n
-0000009581 00000 n
+0000009354 00000 n
+0000009426 00000 n
+0000009518 00000 n
0000000015 00000 n
0000000071 00000 n
0000000619 00000 n
0000000739 00000 n
0000000785 00000 n
-0000009704 00000 n
+0000009641 00000 n
0000000920 00000 n
-0000009767 00000 n
+0000009704 00000 n
0000001057 00000 n
-0000009833 00000 n
+0000009770 00000 n
0000001194 00000 n
-0000009899 00000 n
+0000009836 00000 n
0000001331 00000 n
-0000003815 00000 n
-0000003938 00000 n
-0000004042 00000 n
-0000004234 00000 n
-0000004434 00000 n
-0000004636 00000 n
-0000004832 00000 n
-0000005023 00000 n
-0000005222 00000 n
-0000005423 00000 n
-0000005618 00000 n
-0000005792 00000 n
-0000005995 00000 n
-0000006200 00000 n
-0000006399 00000 n
-0000007349 00000 n
-0000007472 00000 n
-0000007520 00000 n
-0000007709 00000 n
-0000007906 00000 n
-0000008105 00000 n
-0000009965 00000 n
-0000008298 00000 n
-0000008419 00000 n
-0000008585 00000 n
-0000008733 00000 n
-0000008861 00000 n
-0000008974 00000 n
-0000009084 00000 n
-0000009192 00000 n
-0000009308 00000 n
+0000003807 00000 n
+0000003930 00000 n
+0000004034 00000 n
+0000004221 00000 n
+0000004418 00000 n
+0000004617 00000 n
+0000004810 00000 n
+0000004997 00000 n
+0000005193 00000 n
+0000005391 00000 n
+0000005583 00000 n
+0000005757 00000 n
+0000005957 00000 n
+0000006159 00000 n
+0000006355 00000 n
+0000007300 00000 n
+0000007423 00000 n
+0000007471 00000 n
+0000007655 00000 n
+0000007849 00000 n
+0000008045 00000 n
+0000009902 00000 n
+0000008235 00000 n
+0000008356 00000 n
+0000008522 00000 n
+0000008670 00000 n
+0000008798 00000 n
+0000008911 00000 n
+0000009021 00000 n
+0000009129 00000 n
+0000009245 00000 n
trailer
<<
/Size 48
@@ -468,5 +468,5 @@
/Info 4 0 R
>>
startxref
-10016
+9953
%%EOF
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Thu Jul 7 15:18:08 2005
@@ -92,11 +92,16 @@
case ProtocolStatus.MOVED: // redirect
case ProtocolStatus.TEMP_MOVED:
- url = status.getMessage();
- if (url != null) {
+ String newUrl = status.getMessage();
+ newUrl = URLFilters.filter(newUrl);
+ if (newUrl != null && !newUrl.equals(url)) {
+ url = newUrl;
redirecting = true;
redirectCount++;
- LOG.fine(" - protocol redirect to " + url);
+ LOG.fine(" - redirect to " + url);
+ } else {
+ LOG.fine(" - redirect skipped: " +
+ (url.equals(newUrl) ? "to same url" : "filtered"));
}
break;
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Jul 7 15:18:08 2005
@@ -115,7 +115,7 @@
if (!fle.getFetch()) { // should we fetch this page?
if (LOG.isLoggable(Level.FINE))
LOG.fine("not fetching " + url);
- handleNoFetch(fle, ProtocolStatus.STATUS_NOTFETCHING);
+ handleFetch(fle, new ProtocolOutput(null, ProtocolStatus.STATUS_NOTFETCHING));
continue;
}
@@ -124,7 +124,7 @@
// in parsing mode). Protocol-level redirects take precedence over
// content-level redirects. Some plugins can handle redirects
// automatically, so that only the final success or failure will be
- // shown? here.
+ // reported here.
boolean refetch = false;
int redirCnt = 0;
do {
@@ -145,15 +145,19 @@
status();
}
}
- ParseStatus ps = handleFetch(url, fle, output);
+ ParseStatus ps = handleFetch(fle, output);
if (ps != null && ps.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
- url = ps.getMessage();
- url = URLFilters.filter(url);
- if (url != null) {
+ String newurl = ps.getMessage();
+ newurl = URLFilters.filter(newurl);
+ if (newurl != null && !newurl.equals(url)) {
refetch = true;
+ url = newurl;
redirCnt++;
fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
- LOG.info(" - content redirect to " + url);
+ LOG.fine(" - content redirect to " + url);
+ } else {
+ LOG.fine(" - content redirect skipped, " +
+ (url.equals(newurl)? "newurl == url" : "prohibited by urlfilter"));
}
}
}
@@ -161,14 +165,19 @@
case ProtocolStatus.MOVED: // try to redirect immediately
case ProtocolStatus.TEMP_MOVED: // try to redirect immediately
// record the redirect. perhaps the DB will want to know this.
- handleNoFetch(fle, pstat);
- url = pstat.getMessage();
- if (url != null) {
+ handleFetch(fle, output);
+ String newurl = pstat.getMessage();
+ newurl = URLFilters.filter(newurl);
+ if (newurl != null && !newurl.equals(url)) {
refetch = true;
+ url = newurl;
redirCnt++;
// create new entry.
fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
LOG.info(" - protocol redirect to " + url);
+ } else {
+ LOG.fine(" - protocol redirect skipped, " +
+ (url.equals(newurl)? "newurl == url" : "prohibited by urlfilter"));
}
break;
case ProtocolStatus.GONE:
@@ -177,22 +186,22 @@
case ProtocolStatus.ROBOTS_DENIED:
case ProtocolStatus.RETRY:
case ProtocolStatus.NOTMODIFIED:
- handleNoFetch(fle, pstat);
+ handleFetch(fle, output);
break;
case ProtocolStatus.EXCEPTION:
logError(url, fle, new Exception(pstat.getMessage())); // retry?
- handleNoFetch(fle, pstat);
+ handleFetch(fle, output);
break;
default:
LOG.warning("Unknown ProtocolStatus: " + pstat.getCode());
- handleNoFetch(fle, pstat);
+ handleFetch(fle, output);
}
} while (refetch && (redirCnt < MAX_REDIRECT));
} catch (Throwable t) { // an unchecked exception
if (fle != null) {
logError(url, fle, t); // retry?
- handleNoFetch(fle, new ProtocolStatus(t));
+ handleFetch(fle, new ProtocolOutput(null, new ProtocolStatus(t)));
}
}
}
@@ -220,59 +229,47 @@
}
}
- private ParseStatus handleFetch(String url, FetchListEntry fle, ProtocolOutput output) {
+ private ParseStatus handleFetch(FetchListEntry fle, ProtocolOutput output) {
Content content = output.getContent();
+ MD5Hash hash = null;
+ String url = fle.getPage().getURL().toString();
+ if (content == null) {
+ content = new Content(url, url, new byte[0], "", new Properties());
+ hash = MD5Hash.digest(url);
+ } else {
+ hash = MD5Hash.digest(content.getContent());
+ }
ProtocolStatus protocolStatus = output.getStatus();
if (!Fetcher.this.parsing) {
- outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
- protocolStatus),
+ outputPage(new FetcherOutput(fle, hash, protocolStatus),
content, null, null);
return null;
}
-
- String contentType = content.getContentType();
- Parser parser = null;
- Parse parse = null;
- ParseStatus status = null;
- try {
- parser = ParserFactory.getParser(contentType, url);
- parse = parser.getParse(content);
- status = parse.getData().getStatus();
- } catch (Exception e) {
- e.printStackTrace();
- status = new ParseStatus(e);
- }
- if (status.isSuccess()) {
- outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
- protocolStatus),
- content, new ParseText(parse.getText()), parse.getData());
- } else {
- LOG.info("fetch okay, but can't parse " + url + ", reason: "
- + status.toString());
- outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
- protocolStatus),
- content, new ParseText(""),
- new ParseData(status, "", new Outlink[0], new Properties()));
- }
- return status;
- }
-
- private void handleNoFetch(FetchListEntry fle, ProtocolStatus status) {
- String url = fle.getPage().getURL().toString();
- MD5Hash hash = MD5Hash.digest(url);
-
- if (Fetcher.this.parsing) {
- outputPage(new FetcherOutput(fle, hash, status),
- new Content(url, url, new byte[0], "", new Properties()),
- new ParseText(""),
- new ParseData(ParseStatus.STATUS_NOTPARSED, "", new Outlink[0], new Properties()));
+ String contentType = content.getContentType();
+ Parser parser = null;
+ Parse parse = null;
+ ParseStatus status = null;
+ try {
+ parser = ParserFactory.getParser(contentType, url);
+ parse = parser.getParse(content);
+ status = parse.getData().getStatus();
+ } catch (Exception e) {
+ e.printStackTrace();
+ status = new ParseStatus(e);
+ }
+ if (status.isSuccess()) {
+ outputPage(new FetcherOutput(fle, hash, protocolStatus),
+ content, new ParseText(parse.getText()), parse.getData());
} else {
- outputPage(new FetcherOutput(fle, hash, status),
- new Content(url, url, new byte[0], "", new Properties()),
- null, null);
+ LOG.info("fetch okay, but can't parse " + url + ", reason: "
+ + status.toString());
+ outputPage(new FetcherOutput(fle, hash, protocolStatus),
+ content, new ParseText(""),
+ new ParseData(status, "", new Outlink[0], new Properties()));
}
+ return status;
}
-
+
private void outputPage(FetcherOutput fo, Content content,
ParseText text, ParseData parseData) {
try {
@@ -363,8 +360,8 @@
for (int i = 0; i < n; i++) {
// this thread may have gone away in the meantime
if (list[i] == null) continue;
- String name = list[i].getName();
- if (name.startsWith(THREAD_GROUP_NAME)) // prove it
+ String tname = list[i].getName();
+ if (tname.startsWith(THREAD_GROUP_NAME)) // prove it
noMoreFetcherThread = false;
if (LOG.isLoggable(Level.FINE))
LOG.fine(list[i].toString());
@@ -446,7 +443,6 @@
/** Run the fetcher. */
public static void main(String[] args) throws Exception {
int threadCount = -1;
- long delay = -1;
String logLevel = "info";
boolean parsing = true;
boolean showThreadID = false;
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java Thu Jul 7 15:18:08 2005
@@ -66,25 +66,6 @@
public static final ParseStatus STATUS_SUCCESS = new ParseStatus(SUCCESS);
public static final ParseStatus STATUS_FAILURE = new ParseStatus(FAILED);
-
- private static class EmptyParseImpl implements Parse {
-
- private ParseData data = null;
-
- public EmptyParseImpl(ParseStatus status) {
- data = new ParseData(status, "", new Outlink[0], new Properties());
- }
-
- public ParseData getData() {
- return data;
- }
-
- public String getText() {
- return "";
- }
- }
-
-
private byte majorCode = 0;
private short minorCode = 0;
private String[] args = null;
@@ -187,7 +168,10 @@
public String toString() {
StringBuffer res = new StringBuffer();
- res.append(majorCodes[majorCode] + "(" + majorCode + "," + minorCode + ")");
+ String name = null;
+ if (majorCode >= 0 && majorCode < majorCodes.length) name = majorCodes[majorCode];
+ else name = "UNKNOWN!";
+ res.append(name + "(" + majorCode + "," + minorCode + ")");
if (args != null) {
if (args.length == 1) {
res.append(": " + String.valueOf(args[0]));
@@ -239,6 +223,23 @@
}
}
return true;
+ }
+
+ private static class EmptyParseImpl implements Parse {
+
+ private ParseData data = null;
+
+ public EmptyParseImpl(ParseStatus status) {
+ data = new ParseData(status, "", new Outlink[0], new Properties());
+ }
+
+ public ParseData getData() {
+ return data;
+ }
+
+ public String getText() {
+ return "";
+ }
}
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolStatus.java Thu Jul 7 15:18:08 2005
@@ -19,6 +19,7 @@
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
+import java.util.HashMap;
import org.apache.nutch.io.VersionedWritable;
import org.apache.nutch.io.WritableUtils;
@@ -77,6 +78,24 @@
private long lastModified;
private String[] args;
+ private static HashMap codeToName = new HashMap();
+ static {
+ codeToName.put(new Integer(SUCCESS), "success");
+ codeToName.put(new Integer(FAILED), "failed");
+ codeToName.put(new Integer(PROTO_NOT_FOUND), "proto_not_found");
+ codeToName.put(new Integer(GONE), "gone");
+ codeToName.put(new Integer(MOVED), "moved");
+ codeToName.put(new Integer(TEMP_MOVED), "temp_moved");
+ codeToName.put(new Integer(NOTFOUND), "notfound");
+ codeToName.put(new Integer(RETRY), "retry");
+ codeToName.put(new Integer(EXCEPTION), "exception");
+ codeToName.put(new Integer(ACCESS_DENIED), "access_denied");
+ codeToName.put(new Integer(ROBOTS_DENIED), "robots_denied");
+ codeToName.put(new Integer(REDIR_EXCEEDED), "redir_exceeded");
+ codeToName.put(new Integer(NOTFETCHING), "notfetching");
+ codeToName.put(new Integer(NOTMODIFIED), "notmodified");
+ }
+
public ProtocolStatus() {
}
@@ -215,7 +234,7 @@
public String toString() {
StringBuffer res = new StringBuffer();
- res.append("(" + code + "), lastModified=" + lastModified);
+ res.append(codeToName.get(new Integer(code)) + "(" + code + "), lastModified=" + lastModified);
if (args != null) {
if (args.length == 1) {
res.append(": " + String.valueOf(args[0]));
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/segment/SegmentReader.java Thu Jul 7 15:18:08 2005
@@ -528,6 +528,7 @@
cnt++;
if (dump) reader.dump(sorted, System.out);
} catch (Throwable t) {
+ t.printStackTrace();
LOG.warning(t.getMessage());
}
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java Thu Jul 7 15:18:08 2005
@@ -183,31 +183,22 @@
continue;
}
- // if fetch was successful or
- // previously unable to parse (so try again)
- ProtocolStatus ps = fetcherOutput.getProtocolStatus();
- if (ps.isSuccess()) {
- handleContent(url, content);
- synchronized (ParseSegment.this) {
- pages++; // record successful parse
- bytes += content.getContent().length;
- if ((pages % 100) == 0)
- status();
- }
- } else {
- // errored at fetch step
- logError(url, new ProtocolException("Error at fetch stage: " + ps));
- handleNoContent(new ParseStatus(ParseStatus.FAILED_MISSING_CONTENT));
+ handleContent(fetcherOutput, content);
+ synchronized (ParseSegment.this) {
+ pages++; // record successful parse
+ bytes += content.getContent().length;
+ if ((pages % 100) == 0)
+ status();
}
} catch (ParseException e) {
logError(url, e);
- handleNoContent(new ParseStatus(e));
+ handleError(new ParseStatus(e));
} catch (Throwable t) { // an unchecked exception
if (fle != null) {
logError(url, t);
- handleNoContent(new ParseStatus(t));
+ handleError(new ParseStatus(t));
} else {
LOG.severe("Unexpected exception");
}
@@ -224,27 +215,35 @@
}
}
- private void handleContent(String url, Content content)
+ private void handleContent(FetcherOutput fo, Content content)
throws ParseException {
- //String contentType = content.getContentType();
- String contentType = content.getMetadata().getProperty("Content-Type");
+ String url = fo.getUrl().toString();
+ if (content != null) {
+ String contentType = content.getMetadata().getProperty("Content-Type");
+ if (ParseSegment.this.dryRun) {
+ LOG.info("To be handled as Content-Type: "+contentType);
+ return;
+ }
- if (ParseSegment.this.dryRun) {
- LOG.info("To be handled as Content-Type: "+contentType);
- return;
+ Parser parser = ParserFactory.getParser(contentType, url);
+ Parse parse = parser.getParse(content);
+ outputPage(new ParseText(parse.getText()), parse.getData());
+
+ } else {
+ if (ParseSegment.this.dryRun) {
+ LOG.info("To be handled as no content");
+ return;
+ }
+ outputPage(new ParseText(""),
+ new ParseData(new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_MISSING_CONTENT),
+ "", new Outlink[0], new Properties()));
}
-
- Parser parser = ParserFactory.getParser(contentType, url);
- Parse parse = parser.getParse(content);
-
- outputPage
- (new ParseText(parse.getText()), parse.getData());
}
- private void handleNoContent(ParseStatus status) {
+ private void handleError(ParseStatus status) {
if (ParseSegment.this.dryRun) {
- LOG.info("To be handled as no content");
+ LOG.info("To be handled as error");
return;
}
outputPage(new ParseText(""),
@@ -267,6 +266,7 @@
+" wait="+(t4-t3) +" write="+(t5-t4) +"ms");
}
} catch (Throwable t) {
+ t.printStackTrace();
LOG.severe("error writing output:" + t.toString());
}
}
Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/build.xml?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/build.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/build.xml Thu Jul 7 15:18:08 2005
@@ -9,6 +9,10 @@
<copy todir="${build.classes}">
<fileset dir="${src.dir}" includes="**/*.ngp, **/*.properties"/>
</copy>
+ <echo>Copying test files</echo>
+ <copy todir="${build.test}">
+ <fileset dir="${src.test}" includes="**/*.test, **/*.txt"/>
+ </copy>
</target>
</project>
Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml Thu Jul 7 15:18:08 2005
@@ -34,7 +34,7 @@
name="Nutch language identifier filter"
point="org.apache.nutch.indexer.IndexingFilter">
<implementation id="LanguageIdentifier"
- class="org.apache.nutch.analysis.lang.LanguageIdentifier"/>
+ class="org.apache.nutch.analysis.lang.LanguageIndexingFilter"/>
</extension>
Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Thu Jul 7 15:18:08 2005
@@ -15,83 +15,153 @@
*/
package org.apache.nutch.analysis.lang;
-import java.io.BufferedReader;
+// JDK imports
import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
import java.io.InputStream;
+import java.io.IOException;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
import java.io.InputStreamReader;
-import java.util.Iterator;
+import java.util.List;
import java.util.Vector;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.ArrayList;
+import java.util.Properties;
+import java.util.Enumeration;
import java.util.logging.Logger;
-import org.apache.nutch.fetcher.FetcherOutput;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
+// Nutch imports
+import org.apache.nutch.analysis.lang.NGramProfile.NGramEntry;
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParserNotFound;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolNotFound;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.util.LogFormatter;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import java.util.Properties;
-import java.util.Enumeration;
/**
*
* @author Sami Siren
- *
+ * @author Jerome Charron
*/
-public class LanguageIdentifier implements IndexingFilter {
- public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.analysis.lang.LanguageIdentifier");
+public class LanguageIdentifier {
+
+
+ private final static int DEFAULT_ANALYSIS_LENGTH = 0; // 0 means full content
+
+ private final static float SCORE_THRESOLD = 0.00F;
+
+ public final static Logger LOG = LogFormatter.getLogger(LanguageIdentifier.class.getName());
+
+
+ private ArrayList languages = new ArrayList();
+
+ private ArrayList supportedLanguages = new ArrayList();
+
+ /** Minimum size of NGrams */
+ private int minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
+
+ /** Maximum size of NGrams */
+ private int maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
+
+ /** The maximum amount of data to analyze */
+ private int analyzeLength = DEFAULT_ANALYSIS_LENGTH;
+
+ /** A global index of ngrams of all supported languages */
+ private HashMap ngramsIdx = new HashMap();
- private Vector languages = new Vector();
+ /** The NGramProfile used for identification */
+ private NGramProfile suspect = null;
- private Vector supportedLanguages = new Vector();
+ /** My singleton instance */
+ private static LanguageIdentifier identifier = null;
- private static LanguageIdentifier identifier = new LanguageIdentifier(true);
- private static float SCORE_THRESOLD = 0.00F;
-
- //public constructor needed for extension mechanism
- public LanguageIdentifier() {}
+ /**
+ * Constructs a new Language Identifier.
+ */
+ private LanguageIdentifier() {
- private LanguageIdentifier(boolean fake) {
+ // Gets ngram sizes to take into account from the Nutch Config
+ minLength = NutchConf.get().getInt("lang.ngram.min.length",
+ NGramProfile.DEFAULT_MIN_NGRAM_LENGTH);
+ maxLength = NutchConf.get().getInt("lang.ngram.max.length",
+ NGramProfile.DEFAULT_MAX_NGRAM_LENGTH);
+ // Ensure the min and max values are in an acceptable range
+ // (i.e. ABSOLUTE_MIN_NGRAM_LENGTH <= min <= max <= ABSOLUTE_MAX_NGRAM_LENGTH)
+ maxLength = Math.min(maxLength, NGramProfile.ABSOLUTE_MAX_NGRAM_LENGTH);
+ maxLength = Math.max(maxLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
+ minLength = Math.max(minLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
+ minLength = Math.min(minLength, maxLength);
+
+ // Gets the value of the maximum size of data to analyze
+ analyzeLength = NutchConf.get().getInt("lang.analyze.max.length",
+ DEFAULT_ANALYSIS_LENGTH);
+
Properties p = new Properties();
try {
p.load(this.getClass().getResourceAsStream("langmappings.properties"));
Enumeration alllanguages = p.keys();
+
+ LOG.info(new StringBuffer()
+ .append("Language identifier configuration [")
+ .append(minLength).append("-").append(maxLength)
+ .append("/").append(analyzeLength).append("]").toString());
StringBuffer list = new StringBuffer("Language identifier plugin supports:");
+ HashMap tmpIdx = new HashMap();
while (alllanguages.hasMoreElements()) {
String lang = (String) (alllanguages.nextElement());
InputStream is = this.getClass().getClassLoader().getResourceAsStream(
- "org/apache/nutch/analysis/lang/" + lang + "." + NGramProfile.NGRAM_FILE_EXTENSION);
+ "org/apache/nutch/analysis/lang/" + lang + "." + NGramProfile.FILE_EXTENSION);
if (is != null) {
- NGramProfile profile = new NGramProfile(lang);
+ NGramProfile profile = new NGramProfile(lang, minLength, maxLength);
try {
profile.load(is);
languages.add(profile);
supportedLanguages.add(lang);
- list.append(" " + lang);
+ List ngrams = profile.getSorted();
+ for (int i=0; i<ngrams.size(); i++) {
+ NGramEntry entry = (NGramEntry) ngrams.get(i);
+ List registered = (List) tmpIdx.get(entry);
+ if (registered == null) {
+ registered = new ArrayList();
+ tmpIdx.put(entry, registered);
+ }
+ registered.add(entry);
+ entry.setProfile(profile);
+ }
+ list.append(" " + lang + "(" + ngrams.size() + ")");
is.close();
} catch (IOException e1) {
LOG.severe(e1.toString());
}
}
}
+ // transform all ngram lists to arrays for performance
+ Iterator keys = tmpIdx.keySet().iterator();
+ while (keys.hasNext()) {
+ NGramEntry entry = (NGramEntry) keys.next();
+ List l = (List) tmpIdx.get(entry);
+ if (l != null) {
+ NGramEntry[] array = (NGramEntry[]) l.toArray(new NGramEntry[l.size()]);
+ ngramsIdx.put(entry.getSeq(), array);
+ }
+ }
LOG.info(list.toString());
+ // Create the suspect profile
+ suspect = new NGramProfile("suspect", minLength, maxLength);
} catch (Exception e) {
LOG.severe(e.toString());
}
@@ -101,6 +171,13 @@
* return handle to singleton instance
*/
public static LanguageIdentifier getInstance() {
+ if (identifier == null) {
+ synchronized(LanguageIdentifier.class) {
+ if (identifier == null) {
+ identifier = new LanguageIdentifier();
+ }
+ }
+ }
return identifier;
}
@@ -157,15 +234,24 @@
if (args[i].equals("-identifyfileset")) {
command = IDFILESET;
for (i++; i < args.length; i++) {
- fileset.add(args[i]);
- System.out.println(args[i]);
+ File[] files = null;
+ File f = new File(args[i]);
+ if (f.isDirectory()) {
+ files = f.listFiles();
+ } else {
+ files = new File[] { f };
+ }
+ for (int j=0; j<files.length; j++) {
+ fileset.add(files[j].getAbsolutePath());
+ }
}
}
}
String lang = null;
- LanguageIdentifier idfr = LanguageIdentifier.getInstance();
+ //LanguageIdentifier idfr = LanguageIdentifier.getInstance();
+ LanguageIdentifier idfr = new LanguageIdentifier();
File f;
FileInputStream fis;
try {
@@ -205,9 +291,12 @@
break;
case IDFILESET:
+ /* used for benchs
+ for (int j=128; j<=524288; j*=2) {
+ long start = System.currentTimeMillis();
+ idfr.analyzeLength = j; */
System.out.println("FILESET");
Iterator i = fileset.iterator();
-
while (i.hasNext()) {
try {
filename = (String) i.next();
@@ -218,12 +307,13 @@
} catch (Exception e) {
System.out.println(e);
}
-
System.out.println(filename + " was identified as " + lang);
}
+ /* used for benchs
+ System.out.println(j + "/" + (System.currentTimeMillis()-start));
+ } */
System.exit(0);
break;
-
}
} catch (Exception e) {
System.out.println(e);
@@ -261,46 +351,57 @@
/**
* Identify language based on submitted content
*
- * @param text text of doc
+ * @param text to analyze
* @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
* unknown
*/
public String identify(String text) {
-
return identify(new StringBuffer(text));
}
- public String identify(StringBuffer text) {
+ /**
+ * Identify language based on submitted content
+ *
+ * @param content the text to analyze
+ * @return 2 letter ISO639 code of language (en, fi, sv...), or an empty
+ * string if no language profile matched
+ */
+ public String identify(StringBuffer content) {
- NGramProfile p = new NGramProfile("suspect");
- p.analyze(text);
+ StringBuffer text = content;
+ if ((analyzeLength > 0) && (content.length() > analyzeLength)) {
+ text = new StringBuffer().append(content);
+ text.setLength(analyzeLength);
+ }
- float topscore = Float.MAX_VALUE;
+ suspect.analyze(text);
+ Iterator iter = suspect.getSorted().iterator();
+ float topscore = Float.MIN_VALUE;
String lang = "";
-
- Iterator i = languages.iterator();
- while (i.hasNext()) {
-
- NGramProfile profile = (NGramProfile) i.next();
- float score = profile.getSimilarity(p);
-
- //LOG.fine(profile.getName() + ":" + score);
-
- if (score < topscore) {
- topscore = score;
- lang = profile.getName();
- }
+ HashMap scores = new HashMap();
+ NGramEntry searched = null;
+
+ while (iter.hasNext()) {
+ searched = (NGramEntry) iter.next();
+ NGramEntry[] ngrams = (NGramEntry[]) ngramsIdx.get(searched.getSeq());
+ if (ngrams != null) {
+ for (int j=0; j<ngrams.length; j++) {
+ NGramProfile profile = ngrams[j].getProfile();
+ Float pScore = (Float) scores.get(profile);
+ if (pScore == null) {
+ pScore = new Float(0);
+ }
+ float plScore = pScore.floatValue();
+ plScore += ngrams[j].getFrequency() + searched.getFrequency();
+ scores.put(profile, new Float(plScore));
+ if (plScore > topscore) {
+ topscore = plScore;
+ lang = profile.getName();
+ }
+ }
+ }
}
-
- p.ngrams.clear();
- p = null;
-
- LOG.finest("TOPSCORE: " + lang + " with " + topscore);
-
- if (topscore > SCORE_THRESOLD)
- return lang;
-
- else return null;
+ return lang;
}
/**
@@ -313,42 +414,17 @@
public String identify(InputStream is) throws IOException {
StringBuffer text = new StringBuffer();
- byte buffer[] = new byte[2000];
+ byte[] buffer = new byte[2048];
int len = 0;
- while ((len = is.read(buffer)) != -1) {
+ while (((len = is.read(buffer)) != -1) &&
+ ((analyzeLength == 0) || (text.length() < analyzeLength))) {
+ if (analyzeLength != 0) {
+ len = Math.min(len, analyzeLength - text.length());
+ }
text.append(new String(buffer, 0, len));
}
-
- return identify(text.toString());
- }
-
- public Document filter(Document doc, Parse parse, FetcherOutput fo) throws IndexingException {
-
- //check if X-meta-lang found, possibly put there by HTMLLanguageParser
- String lang = parse.getData().get(HTMLLanguageParser.META_LANG_NAME);
-
- //check if HTTP-header tels us the language
- if (lang == null) lang = parse.getData().get("Content-Language");
-
- if (lang == null) {
- StringBuffer text = new StringBuffer();
- /*
- * String[] anchors = fo.getAnchors(); for (int i = 0; i < anchors.length;
- * i++) { text+=anchors[i] + " "; }
- */
- text.append(parse.getData().getTitle()).append(" ");
- text.append(parse.getText());
- lang = LanguageIdentifier.getInstance().identify(text);
- }
-
- if (lang == null) {
- lang = "unknown";
- }
-
- doc.add(Field.Keyword("lang", lang));
-
- return doc;
+ return identify(text);
}
}
Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java?rev=209663&r1=209662&r2=209663&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java Thu Jul 7 15:18:08 2005
@@ -13,29 +13,34 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.apache.nutch.analysis.lang;
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
+// JDK imports
import java.io.File;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
import java.io.InputStreamReader;
-import java.io.OutputStream;
+import java.io.BufferedInputStream;
import java.util.Date;
-import java.util.Collections;
-import java.util.Hashtable;
+import java.util.List;
import java.util.Iterator;
-import java.util.Vector;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
import java.util.logging.Logger;
+// Nutch imports
import org.apache.nutch.util.LogFormatter;
+// Lucene imports
import org.apache.lucene.analysis.Token;
+
/**
* This class runs a ngram analysis over submitted text, results might be used
* for automatic language identifiaction.
@@ -45,257 +50,235 @@
* Methods are provided to build new NGramProfiles profiles.
*
* @author Sami Siren
+ * @author Jerome Charron - http://frutch.free.fr/
*/
public class NGramProfile {
public static final Logger LOG = LogFormatter
.getLogger("org.apache.nutch.analysis.lang.NGramProfile");
- private String name;
+ /** The minimum length allowed for a ngram. */
+ final static int ABSOLUTE_MIN_NGRAM_LENGTH = 1;
- private Vector sorted = null;
+ /** The maximum length allowed for a ngram. */
+ final static int ABSOLUTE_MAX_NGRAM_LENGTH = 4;
+
+ /** The default min length of ngram */
+ final static int DEFAULT_MIN_NGRAM_LENGTH = 3;
- private StringBuffer tokensb = new StringBuffer();
+ /** The default max length of ngram */
+ final static int DEFAULT_MAX_NGRAM_LENGTH = 3;
- private int min_ngram_length = DEFAULT_MIN_NGRAM_LENGTH;
+ /** The ngram profile file extension */
+ static final String FILE_EXTENSION = "ngp";
- private int max_ngram_length = DEFAULT_MAX_NGRAM_LENGTH;
+ /** The profile max size (number of ngrams of the same size) */
+ static final int MAX_SIZE = 1000;
- private int ngramcount = 0;
+ /** separator char */
+ static final char SEPARATOR = '_';
+ /** The String form of the separator char */
+ private final static String SEP_CHARSEQ = new String(new char[] { SEPARATOR });
- static final String NGRAM_FILE_EXTENSION = "ngp";
+
+ /** The profile's name */
+ private String name = null;
- static final int NGRAM_LENGTH = 1000;
+ /** The NGrams of this profile sorted on the number of occurrences */
+ private List sorted = null;
- //separator char
- static final char SEPARATOR = '_';
+ /** The min length of ngram */
+ private int minLength = DEFAULT_MIN_NGRAM_LENGTH;
- //default min length of ngram
- static final int DEFAULT_MIN_NGRAM_LENGTH = 1;
+ /** The max length of ngram */
+ private int maxLength = DEFAULT_MAX_NGRAM_LENGTH;
- //default max length of ngram
- static final int DEFAULT_MAX_NGRAM_LENGTH = 4;
+ /** The total number of ngram occurrences, indexed by ngram length */
+ private int[] ngramcounts = null;
- //table to store ngrams
- Hashtable ngrams = null;
+ /** An index of the ngrams of the profile */
+ private Map ngrams = null;
+ /** A StringBuffer used during analysis */
+ private QuickStringBuffer word = new QuickStringBuffer();
+
+
/**
- * private class used to store NGramEntry
+ * Construct a new ngram profile
+ *
+ * @param name is the name of the profile
+ * @param minlen is the min length of ngram sequences
+ * @param maxlen is the max length of ngram sequences
*/
- class NGramEntry implements Comparable {
- private CharSequence seq;
-
- private int count;
-
- private float normalized_count;
-
- public NGramEntry(CharSequence seq) {
- this.seq = seq;
- }
-
- /**
- * @param ngramsequence
- * @param ngramcount
- */
- public NGramEntry(String ngramsequence, int ngramcount) {
- seq = new StringBuffer(ngramsequence).subSequence(0, ngramsequence
- .length());
- this.count = ngramcount;
- }
-
- public int getCount() {
- return count;
- }
-
- public CharSequence getSeq() {
- return seq;
- }
-
- public int compareTo(Object o) {
- if (((NGramEntry) o).count - count != 0)
- return ((NGramEntry) o).count - count;
- else
- return (seq.toString().compareTo(((NGramEntry) o).seq.toString()));
- }
-
- public void inc() {
- count++;
- }
+ public NGramProfile(String name, int minlen, int maxlen) {
+ // TODO: Compute the initial capacity using minlen and maxlen.
+ this.ngrams = new HashMap(4000);
+ this.minLength = minlen;
+ this.maxLength = maxlen;
+ this.name = name;
}
/**
- * Construct a new ngram profile
+ * @return Returns the name.
+ */
+ public String getName() {
+ return name;
+ }
+
+ /**
+ * Add ngrams from a token to this profile
*
- * @param name
- * Name of profile
+ * @param t is the Token to be added
*/
- public NGramProfile(String name) {
- this(name, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH);
+ public void add(Token t) {
+ add(new StringBuffer().append(SEPARATOR)
+ .append(t.termText())
+ .append(SEPARATOR));
}
/**
- * Construct a new ngram profile
+ * Add ngrams from a single word to this profile
*
- * @param name
- * Name of profile
- * @param minlen
- * min length of ngram sequences
- * @param maxlen
- * max length of ngram sequences
+ * @param word is the word to add
*/
- public NGramProfile(String name, int minlen, int maxlen) {
- ngrams = new Hashtable();
- this.max_ngram_length = maxlen;
- this.min_ngram_length = minlen;
- this.name = name;
+ public void add(StringBuffer word) {
+ for (int i=minLength; (i <= maxLength) && (i < word.length()); i++) {
+ add(word, i);
+ }
}
/**
- * Add ngrams from a token to this profile
- *
- * @param t
- * Token to be added
+ * Add the last NGrams from the specified word.
*/
- public void addFromToken(Token t) {
- tokensb.setLength(0);
- tokensb.append(SEPARATOR).append(t.termText()).append(SEPARATOR);
- addNGrams(tokensb);
+ private void add(QuickStringBuffer word) {
+ int wlen = word.length();
+ if (wlen >= minLength) {
+ int max = Math.min(maxLength, wlen);
+ for (int i=minLength; i<=max; i++) {
+ add(word.subSequence(wlen-i, wlen));
+ }
+ }
+ }
+
+ /**
+ * Add one occurrence of a single ngram to this profile
+ * (skipped when the sequence equals the separator string)
+ *
+ * @param cs is the ngram sequence to add
+ */
+ private void add(CharSequence cs) {
+
+ if (cs.equals(SEP_CHARSEQ)) { return; }
+ NGramEntry nge = (NGramEntry) ngrams.get(cs);
+ if (nge == null) {
+ nge = new NGramEntry(cs);
+ ngrams.put(cs, nge);
+ }
+ nge.inc();
}
/**
* Analyze a piece of text
*
- * @param text
- * the text to be analyzed
+ * @param text the text to be analyzed
*/
public void analyze(StringBuffer text) {
- StringBuffer word;
- int i;
if (ngrams != null) {
ngrams.clear();
+ sorted = null;
}
- word = new StringBuffer().append(SEPARATOR);
- for (i = 0; i < text.length(); i++) {
+ word.clear().append(SEPARATOR);
+ for (int i = 0; i < text.length(); i++) {
char c = Character.toLowerCase(text.charAt(i));
if (Character.isLetter(c)) {
- word.append(c);
+ add(word.append(c));
} else {
//found word boundary
if (word.length() > 1) {
//we have a word!
- word.append(SEPARATOR);
- addNGrams(word);
- word.delete(0, word.length());
+ add(word.append(SEPARATOR));
+ word.clear().append(SEPARATOR);
}
}
}
if (word.length() > 1) {
- //we have a last word
- word.append(SEPARATOR);
- addNGrams(word);
+ //we have a word!
+ add(word.append(SEPARATOR));
}
normalize();
}
/**
- * Normalize profile
- */
- protected void normalize() {
- Vector sorted = getSorted();
- int sum = 0;
-
- //only calculate ngramcount if it was not available in profile
- if (ngramcount == 0) {
- for (int i = 0; i < sorted.size(); i++) {
- ngramcount += ((NGramEntry) sorted.get(i)).count;
- }
- }
-
- if (sorted.size() > 0) {
- Iterator i = sorted.iterator();
-
- while (i.hasNext()) {
- NGramEntry e = (NGramEntry) i.next();
- e.normalized_count = e.count / (float)ngramcount;
- }
- }
- }
-
- /**
- * Add ngrams from a single word to this profile
- *
* @param word
+ * @param n sequence length
*/
- public void addNGrams(StringBuffer word) {
- int i;
-
- for (i = min_ngram_length; i <= max_ngram_length && i < word.length(); i++) {
- addNGrams(word, i);
+ private void add(StringBuffer word, int n) {
+ for (int i=0; i <= word.length()-n; i++) {
+ add(word.subSequence(i, i + n));
}
}
-
+
/**
- * @param word
- * @param n
- * sequence length
+ * Normalize the profile (calculates the ngrams frequencies)
*/
- private void addNGrams(StringBuffer word, int n) {
- NGramEntry nge;
- StringBuffer sb;
- int i;
-
- for (i = 0; i <= word.length() - n; i++) {
-
- CharSequence cs = word.subSequence(i, i + n);
+ protected void normalize() {
- if (ngrams.containsKey(cs)) {
- nge = (NGramEntry) ngrams.get(cs);
- } else {
- nge = new NGramEntry(cs);
+ NGramEntry e = null;
+ //List sorted = getSorted();
+ Iterator i = ngrams.values().iterator();
+
+ // Calculate ngramcount if not already done
+ if (ngramcounts == null) {
+ ngramcounts = new int[maxLength+1];
+ while (i.hasNext()) {
+ e = (NGramEntry) i.next();
+ ngramcounts[e.size()] += e.count;
}
- nge.inc();
- ngrams.put(cs, nge);
+ }
+
+ i = ngrams.values().iterator();
+ while (i.hasNext()) {
+ e = (NGramEntry) i.next();
+ e.frequency = (float) e.count / (float) ngramcounts[e.size()];
}
}
/**
- * Return sorted vector of ngrams (sort done by 1. count 2. sequence)
+ * Return a sorted list of ngrams (sort done by 1. frequency 2. sequence)
*
* @return sorted vector of ngrams
*/
- public Vector getSorted() {
- //make sure srting is done only once
+ public List getSorted() {
+ // make sure sorting is done only once
if (sorted == null) {
- sorted = new Vector(ngrams.values());
+ sorted = new ArrayList(ngrams.values());
Collections.sort(sorted);
- //trim at NGRAM_LENGTH entries
- if (sorted.size() > NGRAM_LENGTH)
- sorted.setSize(NGRAM_LENGTH);
+ // trim at MAX_SIZE entries
+ if (sorted.size() > MAX_SIZE) {
+ sorted = sorted.subList(0, MAX_SIZE);
+ }
}
-
return sorted;
}
-
- /**
- * Return ngramprofile as text
- *
- * @return ngramprofile as text
- */
+
+ // Inherited JavaDoc
public String toString() {
- StringBuffer s = new StringBuffer();
+
+ StringBuffer s = new StringBuffer().append("NGramProfile: ")
+ .append(name).append("\n");
Iterator i = getSorted().iterator();
- s.append("NGramProfile: ").append(name).append("\n");
while (i.hasNext()) {
NGramEntry entry = (NGramEntry) i.next();
- s.append(entry.count).append(':').append(entry.seq).append(" ").append(
- entry.normalized_count).append("\n");
+ s.append("[").append(entry.seq)
+ .append("/").append(entry.count)
+ .append("/").append(entry.frequency).append("]\n");
}
return s.toString();
}
@@ -308,6 +291,7 @@
* @return similarity 0=exact match
*/
public float getSimilarity(NGramProfile another) {
+
float sum = 0;
try {
@@ -315,21 +299,20 @@
while (i.hasNext()) {
NGramEntry other = (NGramEntry) i.next();
if (ngrams.containsKey(other.seq)) {
- sum += Math.abs((other.normalized_count - ((NGramEntry) ngrams
- .get(other.seq)).normalized_count)) / 2;
+ sum += Math.abs((other.frequency -
+ ((NGramEntry) ngrams.get(other.seq)).frequency)) / 2;
} else {
- sum += other.normalized_count;
+ sum += other.frequency;
}
}
i = getSorted().iterator();
while (i.hasNext()) {
NGramEntry other = (NGramEntry) i.next();
if (another.ngrams.containsKey(other.seq)) {
- sum += Math
- .abs((other.normalized_count - ((NGramEntry) another.ngrams
- .get(other.seq)).normalized_count)) / 2;
+ sum += Math.abs((other.frequency -
+ ((NGramEntry) another.ngrams.get(other.seq)).frequency)) / 2;
} else {
- sum += other.normalized_count;
+ sum += other.frequency;
}
}
} catch (Exception e) {
@@ -339,27 +322,29 @@
}
/**
- * Loads a ngram profile from InputStream (assumes UTF-8 encoded content)
+ * Loads a ngram profile from an InputStream
+ * (assumes UTF-8 encoded content)
+ * @param is the InputStream to read
*/
public void load(InputStream is) throws IOException {
- BufferedReader bis = new BufferedReader(new InputStreamReader(is, "UTF-8"));
- String line;
ngrams.clear();
+ ngramcounts = new int[maxLength+1];
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+ String line = null;
- while ((line = bis.readLine()) != null) {
+ while ((line = reader.readLine()) != null) {
// # starts a comment line
if (line.charAt(0) != '#') {
int spacepos = line.indexOf(' ');
String ngramsequence = line.substring(0, spacepos).trim();
- int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
-
- if (!line.startsWith("ngram_count")) {
- NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
- ngrams.put(en.getSeq(), en);
- } else {
- this.ngramcount = ngramcount;
+ int len = ngramsequence.length();
+ if ((len >= minLength) && (len <= maxLength)) {
+ int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
+ NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
+ ngrams.put(en.getSeq(), en);
+ ngramcounts[len] += ngramcount;
}
}
}
@@ -369,16 +354,14 @@
/**
* Create a new Language profile from (preferably quite large) text file
*
- * @param name
- * name of profile
- * @param is
- * @param encoding
- * encoding of stream
+ * @param name is the name of the profile
+ * @param is is the stream to read
+ * @param encoding is the encoding of stream
*/
- public static NGramProfile createNgramProfile(String name, InputStream is,
- String encoding) {
+ public static NGramProfile create(String name, InputStream is, String encoding) {
- NGramProfile newProfile = new NGramProfile(name);
+ NGramProfile newProfile = new NGramProfile(name, ABSOLUTE_MIN_NGRAM_LENGTH,
+ ABSOLUTE_MAX_NGRAM_LENGTH);
BufferedInputStream bis = new BufferedInputStream(is);
byte buffer[] = new byte[4096];
@@ -394,7 +377,6 @@
}
newProfile.analyze(text);
-
return newProfile;
}
@@ -402,25 +384,42 @@
* Writes NGramProfile content into OutputStream, content is outputted with
* UTF-8 encoding
*
- * @param os
- * Stream to output to
+ * @param os the Stream to output to
* @throws IOException
*/
-
public void save(OutputStream os) throws IOException {
- Vector v = getSorted();
- Iterator i = v.iterator();
- os
- .write(("# NgramProfile generated at " + new Date() + " for Nutch Language Identification\n")
- .getBytes());
- os.write(("ngram_count " + ngramcount + "\n").getBytes());
- while (i.hasNext()) {
- NGramEntry e = (NGramEntry) i.next();
- String line = e.getSeq().toString() + " " + e.getCount() + "\n";
+ // Write header
+ os.write(("# NgramProfile generated at " + new Date() +
+ " for Nutch Language Identification\n").getBytes());
+
+ // And then each ngram
+
+ // First dispatch ngrams into separate lists according to their size
+ // (one list per size, so that at most MAX_SIZE ngrams are kept for
+ // each ngram size)
+ int count = 0;
+ List list = new ArrayList();
+ List sublist = new ArrayList();
+ NGramEntry[] entries = (NGramEntry[]) ngrams.values().toArray(new NGramEntry[ngrams.size()]);
+ for (int i=minLength; i<=maxLength; i++) {
+ for (int j=0; j<entries.length; j++) {
+ if (entries[j].getSeq().length() == i) {
+ sublist.add(entries[j]);
+ }
+ }
+ Collections.sort(sublist);
+ if (sublist.size() > MAX_SIZE) {
+ sublist = sublist.subList(0, MAX_SIZE);
+ }
+ list.addAll(sublist);
+ sublist.clear();
+ }
+ for (int i=0; i<list.size(); i++) {
+ NGramEntry e = (NGramEntry) list.get(i);
+ String line = e.toString() + " " + e.getCount() + "\n";
os.write(line.getBytes("UTF-8"));
}
-
os.flush();
}
@@ -431,7 +430,10 @@
*/
public static void main(String args[]) {
- String usage = "Usage: NGramProfile [-create profilename filename encoding] [-similarity file1 file2] [-score profile-name filename encoding]";
+ String usage = "Usage: NGramProfile " +
+ "[-create profilename filename encoding] " +
+ "[-similarity file1 file2] "+
+ "[-score profile-name filename encoding]";
int command = 0;
final int CREATE = 1;
@@ -442,7 +444,7 @@
String filename = "";
String filename2 = "";
String encoding = "";
-
+
if (args.length == 0) {
System.err.println(usage);
System.exit(-1);
@@ -479,43 +481,40 @@
File f = new File(filename);
FileInputStream fis = new FileInputStream(f);
- NGramProfile newProfile = NGramProfile.createNgramProfile(profilename,
- fis, encoding);
+ NGramProfile newProfile = NGramProfile.create(profilename, fis, encoding);
fis.close();
- f = new File(profilename + "." + NGRAM_FILE_EXTENSION);
+ f = new File(profilename + "." + FILE_EXTENSION);
FileOutputStream fos = new FileOutputStream(f);
newProfile.save(fos);
- System.out.println("new profile " + profilename + "."
- + NGRAM_FILE_EXTENSION + " was created.");
+ System.out.println("new profile " + profilename + "." + FILE_EXTENSION + " was created.");
break;
case SIMILARITY:
f = new File(filename);
fis = new FileInputStream(f);
- newProfile = NGramProfile.createNgramProfile(filename, fis, encoding);
+ newProfile = NGramProfile.create(filename, fis, encoding);
newProfile.normalize();
f = new File(filename2);
fis = new FileInputStream(f);
- NGramProfile newProfile2 = NGramProfile.createNgramProfile(filename2,
- fis, encoding);
+ NGramProfile newProfile2 = NGramProfile.create(filename2, fis, encoding);
newProfile2.normalize();
- System.out.println("Similarity is "
- + newProfile.getSimilarity(newProfile2));
+ System.out.println("Similarity is " + newProfile.getSimilarity(newProfile2));
break;
case SCORE:
f = new File(filename);
fis = new FileInputStream(f);
- newProfile = NGramProfile.createNgramProfile(filename, fis, encoding);
+ newProfile = NGramProfile.create(filename, fis, encoding);
- f = new File(profilename + "." + NGRAM_FILE_EXTENSION);
+ f = new File(profilename + "." + FILE_EXTENSION);
fis = new FileInputStream(f);
- NGramProfile compare = new NGramProfile(profilename);
+ NGramProfile compare = new NGramProfile(profilename,
+ DEFAULT_MIN_NGRAM_LENGTH,
+ DEFAULT_MAX_NGRAM_LENGTH);
compare.load(fis);
System.out.println("Score is " + compare.getSimilarity(newProfile));
-
break;
}
@@ -525,18 +524,217 @@
}
}
+
/**
- * @return Returns the name.
+ * Inner class that describes a NGram
*/
- public String getName() {
- return name;
+ class NGramEntry implements Comparable {
+
+ /** The NGramProfile this NGram is related to */
+ private NGramProfile profile = null;
+
+ /** The sequence of characters of the ngram */
+ CharSequence seq = null;
+
+ /** The number of occurrences of this ngram in its profile */
+ private int count = 0;
+
+ /** The frequency of this ngram in its profile */
+ private float frequency = 0.0F;
+
+
+ /**
+ * Constructs a new NGramEntry
+ * @param seq is the sequence of characters of the ngram
+ */
+ public NGramEntry(CharSequence seq) {
+ this.seq = seq;
+ }
+
+ /**
+ * Constructs a new NGramEntry
+ * @param seq is the sequence of characters of the ngram
+ * @param count is the number of occurrences of this ngram
+ */
+ public NGramEntry(String seq, int count) {
+ this.seq = new StringBuffer(seq).subSequence(0, seq.length());
+ this.count = count;
+ }
+
+
+ /**
+ * Returns the number of occurrences of this ngram in its profile
+ * @return the number of occurrences of this ngram in its profile
+ */
+ public int getCount() {
+ return count;
+ }
+
+ /**
+ * Returns the frequency of this ngram in its profile
+ * @return the frequency of this ngram in its profile
+ */
+ public float getFrequency() {
+ return frequency;
+ }
+
+ /**
+ * Returns the sequence of characters of this ngram
+ * @return the sequence of characters of this ngram
+ */
+ public CharSequence getSeq() {
+ return seq;
+ }
+
+ /**
+ * Returns the size of this ngram
+ * @return the size of this ngram
+ */
+ public int size() {
+ return seq.length();
+ }
+
+ // Inherited JavaDoc
+ public int compareTo(Object o) {
+ NGramEntry ngram = (NGramEntry) o;
+ int diff = Float.compare(ngram.getFrequency(), frequency);
+ if (diff != 0) {
+ return diff;
+ } else {
+ return (toString().compareTo(ngram.toString()));
+ }
+ }
+
+ /**
+ * Increments the number of occurrences of this ngram.
+ */
+ public void inc() {
+ count++;
+ }
+
+ /**
+ * Associates a profile with this ngram
+ * @param profile is the profile associated with this ngram
+ */
+ public void setProfile(NGramProfile profile) {
+ this.profile = profile;
+ }
+
+ /**
+ * Returns the profile associated with this ngram
+ * @return the profile associated with this ngram
+ */
+ public NGramProfile getProfile() {
+ return profile;
+ }
+
+ // Inherited JavaDoc
+ public String toString() {
+ return seq.toString();
+ }
+
+ // Inherited JavaDoc
+ public int hashCode() {
+ return seq.hashCode();
+ }
+
+ // Inherited JavaDoc
+ public boolean equals(Object obj) {
+
+ NGramEntry ngram = null;
+ try {
+ ngram = (NGramEntry) obj;
+ return ngram.seq.equals(seq);
+ } catch (Exception e) {
+ return false;
+ }
+ }
+
}
- /**
- * @param name
- * The name to set.
- */
- public void setName(String name) {
- this.name = name;
+
+ private class QuickStringBuffer implements CharSequence {
+
+ private char value[];
+
+ private int count;
+
+ QuickStringBuffer() {
+ this(16);
+ }
+
+ QuickStringBuffer(char[] value) {
+ this.value = value;
+ count = value.length;
+ }
+
+ QuickStringBuffer(int length) {
+ value = new char[length];
+ }
+
+ QuickStringBuffer(String str) {
+ this(str.length() + 16);
+ append(str);
+ }
+
+ public int length() {
+ return count;
+ }
+
+ private void expandCapacity(int minimumCapacity) {
+ int newCapacity = (value.length + 1) * 2;
+ if (newCapacity < 0) {
+ newCapacity = Integer.MAX_VALUE;
+ } else if (minimumCapacity > newCapacity) {
+ newCapacity = minimumCapacity;
+ }
+
+ char newValue[] = new char[newCapacity];
+ System.arraycopy(value, 0, newValue, 0, count);
+ value = newValue;
+ }
+
+ QuickStringBuffer clear() {
+ count = 0;
+ return this;
+ }
+
+ public char charAt(int index) {
+ return value[index];
+ }
+
+ QuickStringBuffer append(String str) {
+ if (str == null) {
+ str = String.valueOf(str);
+ }
+
+ int len = str.length();
+ int newcount = count + len;
+ if (newcount > value.length) {
+ expandCapacity(newcount);
+ }
+ str.getChars(0, len, value, count);
+ count = newcount;
+ return this;
+ }
+
+ QuickStringBuffer append(char c) {
+ int newcount = count + 1;
+ if (newcount > value.length) {
+ expandCapacity(newcount);
+ }
+ value[count++] = c;
+ return this;
+ }
+
+ public CharSequence subSequence(int start, int end) {
+ return new String(value, start, end - start);
+ }
+
+ public String toString() {
+ return new String(this.value);
+ }
}
+
+
}