You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@roller.apache.org by ga...@apache.org on 2005/12/18 18:47:19 UTC

svn commit: r357482 - in /incubator/roller/trunk: src/org/roller/presentation/filters/RefererFilter.java web/WEB-INF/classes/roller.properties

Author: gangolli
Date: Sun Dec 18 09:47:16 2005
New Revision: 357482

URL: http://svn.apache.org/viewcvs?rev=357482&view=rev
Log:
For ROL-934.  Support robot user-agent check in RefererFilter.  The property to configure this has been added to roller.properties with the setting commented out.  To enable, uncomment it and edit the pattern if/as desired.  

Modified:
    incubator/roller/trunk/src/org/roller/presentation/filters/RefererFilter.java
    incubator/roller/trunk/web/WEB-INF/classes/roller.properties

Modified: incubator/roller/trunk/src/org/roller/presentation/filters/RefererFilter.java
URL: http://svn.apache.org/viewcvs/incubator/roller/trunk/src/org/roller/presentation/filters/RefererFilter.java?rev=357482&r1=357481&r2=357482&view=diff
==============================================================================
--- incubator/roller/trunk/src/org/roller/presentation/filters/RefererFilter.java (original)
+++ incubator/roller/trunk/src/org/roller/presentation/filters/RefererFilter.java Sun Dec 18 09:47:16 2005
@@ -1,6 +1,8 @@
 package org.roller.presentation.filters;
 
 import java.io.IOException;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
 
 import javax.servlet.Filter;
 import javax.servlet.FilterChain;
@@ -17,95 +19,107 @@
 import org.roller.model.RollerFactory;
 import org.roller.presentation.RollerContext;
 import org.roller.presentation.RollerRequest;
+import org.roller.config.RollerConfig;
 
 /**
  * Keep track of referers.
  *
- * @web.filter name="RefererFilter"
- *
  * @author David M. Johnson
+ * @web.filter name="RefererFilter"
  */
 public class RefererFilter implements Filter {
-    private FilterConfig   mFilterConfig = null;
-    private static Log mLogger =
-            LogFactory.getFactory().getInstance(RefererFilter.class);
-    
+    private FilterConfig mFilterConfig = null;
+    private static Log mLogger = LogFactory.getFactory().getInstance(RefererFilter.class);
+    private static Pattern robotPattern = null;
+    private static final String ROBOT_PATTERN_PROP_NAME = "referrer.robotCheck.userAgentPattern";
+
     /**
      * destroy
      */
     public void destroy() {
     }
-    
+
     /**
      * doFilter
      */
-    public void doFilter(
-            ServletRequest req, ServletResponse res, FilterChain chain)
-            throws IOException, ServletException {
-        HttpServletRequest request = (HttpServletRequest)req;
+    public void doFilter(ServletRequest req, ServletResponse res, FilterChain chain) throws IOException, ServletException {
+        HttpServletRequest request = (HttpServletRequest) req;
         boolean isRefSpammer = false;
+        boolean isRobot = false;
+
         try {
-            RollerRequest rreq = RollerRequest.getRollerRequest(request);
-            RollerContext rctx = RollerContext.getRollerContext(
-                    mFilterConfig.getServletContext());
-            
-            if (rreq!=null && rreq.getWebsite() != null) {
-                String handle = rreq.getWebsite().getHandle();
-                
-                // Base page URLs, with and without www.
-                String basePageUrlWWW =
-                        rctx.getAbsoluteContextUrl(request)+"/page/"+handle;
-                String basePageUrl = basePageUrlWWW;
-                if ( basePageUrlWWW.startsWith("http://www.") ) {
-                    // chop off the http://www.
-                    basePageUrl = "http://"+basePageUrlWWW.substring(11);
-                }
-                
-                // Base comment URLs, with and without www.
-                String baseCommentsUrlWWW =
-                        rctx.getAbsoluteContextUrl(request)+"/comments/"+handle;
-                String baseCommentsUrl = baseCommentsUrlWWW;
-                if ( baseCommentsUrlWWW.startsWith("http://www.") ) {
-                    // chop off the http://www.
-                    baseCommentsUrl= "http://"+baseCommentsUrlWWW.substring(11);
-                }
-                
-                // Don't process hits from same user's blogs as referers by
-                // ignoring Don't process referer from pages that start with base URLs.
-                String referer = request.getHeader("Referer");
-                if (  referer==null ||
-                        (
-                        !referer.startsWith( basePageUrl )
-                        && !referer.startsWith( basePageUrlWWW )
-                        && !referer.startsWith( baseCommentsUrl )
-                        && !referer.startsWith( baseCommentsUrlWWW )
-                        )
-                        ) {
-                    RefererManager refMgr =
-                            RollerFactory.getRoller().getRefererManager();
-                    isRefSpammer = refMgr.processRequest(rreq);
-                } else {
-                    if (mLogger.isDebugEnabled()) {
-                        mLogger.debug("Ignoring referer="+referer);
+            if (robotPattern != null) {
+                // If the pattern is present, we check for whether the User-Agent matches,
+                // and set isRobot if so.  Currently, all referral processing, including
+                // spam check, is skipped for robots identified in this way.
+                String userAgent = request.getHeader("User-Agent");
+                isRobot = (userAgent != null && userAgent.length() > 0 && robotPattern.matcher(userAgent).matches());
+            }
+
+            if (!isRobot) {
+                RollerRequest rreq = RollerRequest.getRollerRequest(request);
+                RollerContext rctx = RollerContext.getRollerContext(mFilterConfig.getServletContext());
+
+                if (rreq != null && rreq.getWebsite() != null) {
+                    String handle = rreq.getWebsite().getHandle();
+
+                    // Base page URLs, with and without www.
+                    String basePageUrlWWW = rctx.getAbsoluteContextUrl(request) + "/page/" + handle;
+                    String basePageUrl = basePageUrlWWW;
+                    if (basePageUrlWWW.startsWith("http://www.")) {
+                        // chop off the http://www.
+                        basePageUrl = "http://" + basePageUrlWWW.substring(11);
+                    }
+
+                    // Base comment URLs, with and without www.
+                    String baseCommentsUrlWWW = rctx.getAbsoluteContextUrl(request) + "/comments/" + handle;
+                    String baseCommentsUrl = baseCommentsUrlWWW;
+                    if (baseCommentsUrlWWW.startsWith("http://www.")) {
+                        // chop off the http://www.
+                        baseCommentsUrl = "http://" + baseCommentsUrlWWW.substring(11);
+                    }
+
+                    // Don't process hits that come from the same user's own blog:
+                    // ignore any referer that starts with one of the base URLs above.
+                    String referer = request.getHeader("Referer");
+                    if (referer == null || (!referer.startsWith(basePageUrl) && !referer.startsWith(basePageUrlWWW) && !referer.startsWith(baseCommentsUrl) && !referer.startsWith(baseCommentsUrlWWW)))
+                    {
+                        RefererManager refMgr = RollerFactory.getRoller().getRefererManager();
+                        isRefSpammer = refMgr.processRequest(rreq);
+                    } else {
+                        if (mLogger.isDebugEnabled()) {
+                            mLogger.debug("Ignoring referer=" + referer);
+                        }
                     }
                 }
             }
         } catch (Exception e) {
-            mLogger.error("Processing referer",e);
+            mLogger.error("Processing referer", e);
         }
-        
+
         if (isRefSpammer) {
-            HttpServletResponse response = (HttpServletResponse)res;
+            HttpServletResponse response = (HttpServletResponse) res;
             response.sendError(HttpServletResponse.SC_FORBIDDEN);
         } else {
             chain.doFilter(req, res);
         }
     }
-    
+
     /**
      * init
      */
     public void init(FilterConfig filterConfig) throws ServletException {
         mFilterConfig = filterConfig;
+        String robotPatternStr = RollerConfig.getProperty(ROBOT_PATTERN_PROP_NAME);
+        if (robotPatternStr != null && robotPatternStr.length() >0) {
+            // Parse the pattern, and store the compiled form.
+            try {
+                robotPattern = Pattern.compile(robotPatternStr);
+            } catch (Exception e) {
+                // Most likely a PatternSyntaxException; log and continue as if it is not set.
+                mLogger.error("Error parsing "+ ROBOT_PATTERN_PROP_NAME + " value '" +
+                        robotPatternStr + "'.  Robots will not be filtered. ", e);
+            }
+        }
     }
 }

Modified: incubator/roller/trunk/web/WEB-INF/classes/roller.properties
URL: http://svn.apache.org/viewcvs/incubator/roller/trunk/web/WEB-INF/classes/roller.properties?rev=357482&r1=357481&r2=357482&view=diff
==============================================================================
--- incubator/roller/trunk/web/WEB-INF/classes/roller.properties (original)
+++ incubator/roller/trunk/web/WEB-INF/classes/roller.properties Sun Dec 18 09:47:16 2005
@@ -207,6 +207,14 @@
 # trackback.allowedURLs=http://w3.ibm.com/.*||http://another.example.com/.*
 trackback.allowedURLs=
 
+#Robot check in referral processing.  If this pattern is set and the entire User-Agent
+#header of a request matches it (a case-sensitive java.util.regex match), all referral
+#processing is skipped: the referral spam check is not run, the request is allowed to
+#proceed, the referrer is not recorded, and the hit count is not incremented.  Recommended
+#for large sites that get a lot of legitimate crawler bot traffic.  The pattern below is a
+#suggestion that has been reported to work well.
+#referrer.robotCheck.userAgentPattern=.*(slurp|bot|java).*
+
 #----------------------------------
 # ping settings