You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@whimsical.apache.org by se...@apache.org on 2022/04/29 13:35:29 UTC
[whimsy] branch master updated: Look for non-ASF resource references
This is an automated email from the ASF dual-hosted git repository.
sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git
The following commit(s) were added to refs/heads/master by this push:
new f23bf389 Look for non-ASF resource references
f23bf389 is described below
commit f23bf3890b5da9663401196fbf7e69f44c28e1d5
Author: Sebb <se...@apache.org>
AuthorDate: Fri Apr 29 14:35:09 2022 +0100
Look for non-ASF resource references
---
lib/whimsy/sitestandards.rb | 9 ++++++
tools/asf-site-check.rb | 75 +++++++++++++++++++++++++++++++++++++++++++++
tools/site-scan.rb | 8 +++++
3 files changed, 92 insertions(+)
diff --git a/lib/whimsy/sitestandards.rb b/lib/whimsy/sitestandards.rb
index 769b5c82..b5fc1254 100644
--- a/lib/whimsy/sitestandards.rb
+++ b/lib/whimsy/sitestandards.rb
@@ -119,6 +119,15 @@ module SiteStandards
CHECK_DOC => 'All websites must link to the Privacy Policy.',
},
+ 'resources' => { # Custom: resources not outside ASF
+ CHECK_TEXT => nil,
+ CHECK_CAPTURE => nil,
+ CHECK_VALIDATE => %r{.}i,
+ CHECK_TYPE => true,
+ CHECK_POLICY => 'https://privacy.apache.org/faq/committers.html',
+ CHECK_DOC => 'Websites must not link to externally hosted resources',
+ },
+
'image' => { # Custom: merely looks in IMAGE_DIR for #{id}.*
CHECK_TEXT => nil,
CHECK_CAPTURE => nil,
diff --git a/tools/asf-site-check.rb b/tools/asf-site-check.rb
new file mode 100644
index 00000000..ee1378cb
--- /dev/null
+++ b/tools/asf-site-check.rb
@@ -0,0 +1,75 @@
#!/usr/bin/env ruby

# Determines if a host name is controlled by the ASF

# TODO: derive from the list at:
# https://raw.githubusercontent.com/apache/privacy-website/main/policies/asf-domains.md

# Helpers for deciding whether a host name or URL belongs to a domain
# that is known to be under ASF control.
module ASFDOMAIN
  # Registered domains known to be under ASF control.
  # Every entry is lower case and of the form a.b (exactly two labels);
  # asfhost? relies on that.
  ASF_DOMAINS = %w[
    any23.com
    any23.org
    apache-extras.org
    apache.org
    apachecon.com
    apachecon.org
    apacheextras.org
    apachextras.org
    cloudstack.com
    cloudstack.org
    codehaus.org
    couchapp.com
    couchapp.org
    couchhack.org
    deltaspike.org
    feathercast.org
    freemarker.org
    gremlint.com
    groovy-lang.org
    ignite.run
    jclouds.com
    jclouds.net
    jclouds.org
    jspwiki.org
    libcloud.com
    libcloud.net
    libcloud.org
    modssl.com
    modssl.net
    myfaces.org
    netbeans.org
    ofbiz.org
    openoffice.org
    openwhisk.com
    openwhisk.net
    openwhisk.org
    projectgeode.org
    qi4j.org
    spamassassin.org
    subversion.com
    subversion.net
    subversion.org
    tinkerpop.com
  ].freeze

  # Check if a host name is known to be under ASF control.
  #
  # The host matches when it is one of ASF_DOMAINS or a subdomain of one.
  # Comparison is case-insensitive and a trailing root dot is ignored.
  #
  # @param host [String, nil] bare host name (no scheme, port or path)
  # @return [Boolean] true if the host is under a known ASF domain
  def self.asfhost?(host)
    name = host.to_s.strip.downcase.chomp('.')
    # This assumes all ASF domains are of the form a.b, so only the last two
    # labels need to be compared. Splitting on '.' (rather than matching \w+)
    # keeps hyphenated domains such as groovy-lang.org working for subdomains.
    ASF_DOMAINS.include?(name.split('.').last(2).join('.'))
  end

  # Check if a URL is known to be under ASF control.
  # Extracts the host name and calls asfhost?
  #
  # Only http(s) and protocol-relative (//host/...) URLs name a host; anything
  # else (relative paths, data: URIs, other schemes) is treated as a relative
  # link and accepted.
  #
  # @param url [String, nil] the URL as found in the page
  # @return [Boolean] true if the URL is relative or points at an ASF host
  def self.asfurl?(url)
    # The authority ends at the first '/', '?' or '#'.
    match = %r{\A(?:https?:)?//([^/?#]*)}i.match(url.to_s.strip)
    return true unless match # a relative link

    # Drop any userinfo ("user:pass@") and port (":8080") around the host.
    host = match[1].sub(/\A.*@/, '').sub(/:\d*\z/, '')
    asfhost?(host)
  end
end

# When run as a script, report both checks for each command-line argument.
if __FILE__ == $0
  ARGV.each do |arg|
    p [arg, ASFDOMAIN.asfhost?(arg), ASFDOMAIN.asfurl?(arg)]
  end
end
diff --git a/tools/site-scan.rb b/tools/site-scan.rb
index 7e76cd51..1e6a766f 100755
--- a/tools/site-scan.rb
+++ b/tools/site-scan.rb
@@ -13,6 +13,7 @@ require 'json'
require 'whimsy/asf'
require 'whimsy/cache'
require 'whimsy/sitestandards'
+require_relative 'asf-site-check'
# Normalize spaces in text runs
def squash(text)
@@ -146,6 +147,13 @@ def parse(id, site, name)
# THIRD: see if an image has been uploaded
data[:image] = ASF::SiteImage.find(id)
+ # Check for resource loading from non-ASF domains
+ js_urls = doc.xpath('//script/@src').map(&:content).reject {|x| ASFDOMAIN.asfurl? x}
+ css_urls = doc.xpath('//link/@href').map(&:content).reject {|x| ASFDOMAIN.asfurl? x}
+ img_urls = doc.xpath('//img/@src').map(&:content).reject {|x| ASFDOMAIN.asfurl? x}
+ resources = js_urls.size + css_urls.size + img_urls.size
+ data[:resources] = 'Found no external resources' if resources == 0
+
return data
end