You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@whimsical.apache.org by se...@apache.org on 2022/04/29 13:35:29 UTC

[whimsy] branch master updated: Look for non-ASF resource references

This is an automated email from the ASF dual-hosted git repository.

sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git


The following commit(s) were added to refs/heads/master by this push:
     new f23bf389 Look for non-ASF resource references
f23bf389 is described below

commit f23bf3890b5da9663401196fbf7e69f44c28e1d5
Author: Sebb <se...@apache.org>
AuthorDate: Fri Apr 29 14:35:09 2022 +0100

    Look for non-ASF resource references
---
 lib/whimsy/sitestandards.rb |  9 ++++++
 tools/asf-site-check.rb     | 75 +++++++++++++++++++++++++++++++++++++++++++++
 tools/site-scan.rb          |  8 +++++
 3 files changed, 92 insertions(+)

diff --git a/lib/whimsy/sitestandards.rb b/lib/whimsy/sitestandards.rb
index 769b5c82..b5fc1254 100644
--- a/lib/whimsy/sitestandards.rb
+++ b/lib/whimsy/sitestandards.rb
@@ -119,6 +119,15 @@ module SiteStandards
       CHECK_DOC => 'All websites must link to the Privacy Policy.',
     },
 
+    'resources' => { # Custom: resources not outside ASF
+      CHECK_TEXT => nil,
+      CHECK_CAPTURE => nil,
+      CHECK_VALIDATE => %r{.}i,
+      CHECK_TYPE => true,
+      CHECK_POLICY => 'https://privacy.apache.org/faq/committers.html',
+      CHECK_DOC => 'Websites must not link to externally hosted resources',
+    },
+
     'image' => { # Custom: merely looks in IMAGE_DIR for #{id}.*
       CHECK_TEXT => nil,
       CHECK_CAPTURE => nil,
diff --git a/tools/asf-site-check.rb b/tools/asf-site-check.rb
new file mode 100644
index 00000000..ee1378cb
--- /dev/null
+++ b/tools/asf-site-check.rb
@@ -0,0 +1,75 @@
+#!/usr/bin/env ruby
+
+# Determines if a host name is controlled by the ASF
+
+# TODO: derive from the list at: 
+# https://raw.githubusercontent.com/apache/privacy-website/main/policies/asf-domains.md
+
+# Helpers for deciding whether a host name or URL belongs to a domain
+# that is known to be under ASF control.
+module ASFDOMAIN
+  # Registered domains treated as ASF-controlled (all lower case).
+  ASF_DOMAINS = %w[
+    any23.com
+    any23.org
+    apache-extras.org
+    apache.org
+    apachecon.com
+    apachecon.org
+    apacheextras.org
+    apachextras.org
+    cloudstack.com
+    cloudstack.org
+    codehaus.org
+    couchapp.com
+    couchapp.org
+    couchhack.org
+    deltaspike.org
+    feathercast.org
+    freemarker.org
+    gremlint.com
+    groovy-lang.org
+    ignite.run
+    jclouds.com
+    jclouds.net
+    jclouds.org
+    jspwiki.org
+    libcloud.com
+    libcloud.net
+    libcloud.org
+    modssl.com
+    modssl.net
+    myfaces.org
+    netbeans.org
+    ofbiz.org
+    openoffice.org
+    openwhisk.com
+    openwhisk.net
+    openwhisk.org
+    projectgeode.org
+    qi4j.org
+    spamassassin.org
+    subversion.com
+    subversion.net
+    subversion.org
+    tinkerpop.com
+  ].freeze
+
+  # Check if a host name is known to be under ASF control.
+  #
+  # host - the host name to test (String, or nil)
+  #
+  # Returns true when the host is one of ASF_DOMAINS or a subdomain of one,
+  # false otherwise (including for nil or empty input).
+  def self.asfhost?(host)
+    # Host names are case-insensitive and may carry a trailing root dot
+    name = host.to_s.strip.downcase.chomp('.')
+    return false if name.empty?
+
+    # Compare against whole labels so that hyphenated domains
+    # (e.g. www.groovy-lang.org) match, while look-alikes such as
+    # notapache.org or apache.org.example.com do not.
+    ASF_DOMAINS.any? { |domain| name == domain || name.end_with?(".#{domain}") }
+  end
+
+  # Check if a URL is known to be under ASF control.
+  # Extracts the host name and calls asfhost?
+  #
+  # url - the URL or link target to test (String, or nil)
+  #
+  # Returns true for ASF hosts and for anything without an http(s) or
+  # protocol-relative authority (i.e. relative links); false otherwise.
+  def self.asfurl?(url)
+    # Accept both "http(s)://host/..." and protocol-relative "//host/...";
+    # the authority ends at the first "/", "?" or "#".
+    match = url.to_s.strip.match(%r{\A(?:https?:)?//([^/?#]*)}i)
+    return true unless match # a relative link
+
+    # Drop any "user:password@" prefix and ":port" suffix to leave the host
+    asfhost?(match[1].sub(/\A.*@/, '').sub(/:\d*\z/, ''))
+  end
+end
+
+# When this file is the program being run, print for each command-line
+# argument: [argument, asfhost? result, asfurl? result].
+if $PROGRAM_NAME == __FILE__
+  ARGV.each { |candidate| p [candidate, ASFDOMAIN.asfhost?(candidate), ASFDOMAIN.asfurl?(candidate)] }
+end
diff --git a/tools/site-scan.rb b/tools/site-scan.rb
index 7e76cd51..1e6a766f 100755
--- a/tools/site-scan.rb
+++ b/tools/site-scan.rb
@@ -13,6 +13,7 @@ require 'json'
 require 'whimsy/asf'
 require 'whimsy/cache'
 require 'whimsy/sitestandards'
+require_relative 'asf-site-check'
 
 # Normalize spaces in text runs
 def squash(text)
@@ -146,6 +147,13 @@ def parse(id, site, name)
   # THIRD: see if an image has been uploaded
   data[:image] = ASF::SiteImage.find(id)
 
+  # Check for resource loading from non-ASF domains
+  js_urls  = doc.xpath('//script/@src').map(&:content).reject {|x| ASFDOMAIN.asfurl? x}
+  css_urls = doc.xpath('//link/@href').map(&:content).reject {|x| ASFDOMAIN.asfurl? x}
+  img_urls = doc.xpath('//img/@src').map(&:content).reject {|x| ASFDOMAIN.asfurl? x}
+  resources = js_urls.size + css_urls.size + img_urls.size
+  data[:resources] = 'Found no external resources' if resources == 0
+
   return data
 end