You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/01/22 22:28:21 UTC
svn commit: r1726318 - /nutch/trunk/conf/regex-normalize.xml.template
Author: snagel
Date: Fri Jan 22 21:28:21 2016
New Revision: 1726318
URL: http://svn.apache.org/viewvc?rev=1726318&view=rev
Log:
NUTCH-2204 : revert erroneous commit
Modified:
nutch/trunk/conf/regex-normalize.xml.template
Modified: nutch/trunk/conf/regex-normalize.xml.template
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/regex-normalize.xml.template?rev=1726318&r1=1726317&r2=1726318&view=diff
==============================================================================
--- nutch/trunk/conf/regex-normalize.xml.template (original)
+++ nutch/trunk/conf/regex-normalize.xml.template Fri Jan 22 21:28:21 2016
@@ -39,12 +39,11 @@
<substitution>/$3</substitution>
</regex> -->
-<!-- removes interpage href anchors such as site.com#location
- (also done by urlnormalizer-basic
+<!-- removes interpage href anchors such as site.com#location -->
<regex>
- <pattern>#.*</pattern>
+ <pattern>#.*?(\?|&amp;|$)</pattern>
<substitution>$1</substitution>
-</regex> -->
+</regex>
<!-- cleans ?&var=value into ?var=value -->
<regex>