You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@whimsical.apache.org by se...@apache.org on 2017/04/27 21:17:41 UTC

[whimsy] branch master updated: Tidy up output a bit more

This is an automated email from the ASF dual-hosted git repository.

sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git

The following commit(s) were added to refs/heads/master by this push:
       new  9b9dd73   Tidy up output a bit more
9b9dd73 is described below

commit 9b9dd73bc2f7dc4499f3b3de8fc2e25fbb13db4d
Author: Sebb <se...@apache.org>
AuthorDate: Thu Apr 27 22:17:40 2017 +0100

    Tidy up output a bit more
---
 tools/site-scan.rb | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/tools/site-scan.rb b/tools/site-scan.rb
index 5443ad0..176f015 100755
--- a/tools/site-scan.rb
+++ b/tools/site-scan.rb
@@ -28,6 +28,10 @@ def fetch(uri)
   end
 end
 
+def squash(text)
+  text.scrub.gsub(/[[:space:]]+/, ' ').strip
+end
+
 def parse(site, name)
   uri, request, response = fetch(site)
   doc = Nokogiri::HTML(response.body)
@@ -52,7 +56,7 @@ def parse(site, name)
       if img
         data[:foundation] = uri + img['src'].strip
       else
-        data[:foundation] = a.text 
+        data[:foundation] = squash(a.text) 
       end
     end
 
@@ -86,18 +90,19 @@ def parse(site, name)
   end
   doc.traverse do |node|
     next unless node.is_a?(Nokogiri::XML::Text)
-    # scrub is needed as some sites have invalid UTF-8 bytes
-    # gsub needed because we may need to match multiple words
-    txt = node.text.scrub.gsub(/[[:space:]]+/, ' ')
-    # trademarks may appear twice. TODO use array?
-    if txt =~ / Apache feather\b/ and not data[:trademarks]
+
+    txt = squash(node.text)
+
+    if txt =~ /\btrademarks\b/ and not data[:trademarks]
       t, p = getText(txt, node)
-      data[:trademarks] = t
+      # drop previous text if it looks like Copyright sentence
+      data[:trademarks] = t.sub(/^.*?Copyright .+? Foundation[.]?/,'').strip
       data[:tradeparent] = p if p
     end
     if txt =~ /Copyright / or txt =~ /©/
       t, p = getText(txt, node)
-      data[:copyright] = t
+      # drop text around the Copyright (or the symbol)
+      data[:copyright] = t.sub(/^.*?((Copyright|©) .+? Foundation[.]?).*/,'\1').strip
       data[:copyparent] = p if p
     end
   end
@@ -109,15 +114,13 @@ def getText(txt, node)
   parent = nil # debug to show where parent needed to be fetched
   if not txt =~ /Apache Software Foundation/i # have we got all the text?
     if node.parent.name == 'a' # e.g. whimsical. such parents don't have extra text.
-      txt = node.parent.parent.text.scrub
+      txt = squash(node.parent.parent.text)
     else
-      txt = node.parent.text.scrub
+      txt = squash(node.parent.text)
     end
     parent = true
   end
-  # TODO strip extra text where possible.
-  # Note: both copyright and trademark can be in same text (e.g. Cayenne)
-  return txt.gsub(/[[:space:]]+/, ' ').strip, parent
+  return txt, parent
 end
 
 $verbose = ARGV.delete '--verbose'

-- 
To stop receiving notification emails like this one, please contact
['"commits@whimsical.apache.org" <co...@whimsical.apache.org>'].