You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@brooklyn.apache.org by he...@apache.org on 2016/02/01 19:01:07 UTC
[09/50] [abbrv] brooklyn-client git commit: optimize creation of whitelist files

optimize creation of whitelist files

About 50x faster, I'd say. It now gets the commit IDs and parses them for files,
rather than following every file. The output is similar:
still imperfect because of how git infers moves, with slightly different mistakes but nothing egregious,
and if anything it is slightly better.


Project: http://git-wip-us.apache.org/repos/asf/brooklyn-client/repo
Commit: http://git-wip-us.apache.org/repos/asf/brooklyn-client/commit/7b346367
Tree: http://git-wip-us.apache.org/repos/asf/brooklyn-client/tree/7b346367
Diff: http://git-wip-us.apache.org/repos/asf/brooklyn-client/diff/7b346367

Branch: refs/heads/master
Commit: 7b3463671f58d349c024e64a2c8d6021ab6ea5e2
Parents: 4904ee0
Author: Alex Heneveld <al...@cloudsoftcorp.com>
Authored: Wed Dec 16 12:28:47 2015 +0000
Committer: Alex Heneveld <al...@cloudsoftcorp.com>
Committed: Wed Dec 16 14:31:22 2015 +0000

----------------------------------------------------------------------
 3-create-full-whitelists.sh |  8 ++---
 4-make-new-repos.sh         |  2 +-
 make-whitelist.sh           | 67 ++++++++++++++++++++++++++--------------
 uber-repo-whitelist.txt     |  2 ++
 4 files changed, 50 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/7b346367/3-create-full-whitelists.sh
----------------------------------------------------------------------
diff --git a/3-create-full-whitelists.sh b/3-create-full-whitelists.sh
index 4b3bf50..337e242 100755
--- a/3-create-full-whitelists.sh
+++ b/3-create-full-whitelists.sh
@@ -5,12 +5,12 @@ set -e
 
 . env.sh
 
-for x in $PROJS ; do
-  ./make-whitelist.sh incubator-brooklyn/ "brooklyn-$x $(cat common-whitelist.txt) $(cat $x-whitelist.txt)" $x-whitelist.full.gen.txt
+for x in $PROJS uber-repo ; do
+  echo brooklyn-$x | cat - $x-whitelist.txt common-whitelist.txt > TMP-whitelist-$x.gen.txt
+  ./make-whitelist.sh incubator-brooklyn/ TMP-whitelist-$x.gen.txt $x-whitelist.full.gen.txt
+  rm TMP-whitelist-$x.gen.txt
 done
 
-./make-whitelist.sh incubator-brooklyn/ "brooklyn $(cat common-whitelist.txt) README.md" brooklyn-uber-repo-whitelist.full.gen.txt
-
 # finally anything which isn't in any full whitelist, put into unclaimed-whitelist.gen.txt
 
 pushd incubator-brooklyn

http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/7b346367/4-make-new-repos.sh
----------------------------------------------------------------------
diff --git a/4-make-new-repos.sh b/4-make-new-repos.sh
index ccfca3e..267427f 100755
--- a/4-make-new-repos.sh
+++ b/4-make-new-repos.sh
@@ -90,5 +90,5 @@ for x in $PROJS ; do
   do_repo_w_whitelist brooklyn-$x $x-whitelist.full.gen.txt
 done
 
-do_repo_w_whitelist brooklyn brooklyn-uber-repo-whitelist.gen.txt
+do_repo_w_whitelist brooklyn uber-repo-whitelist.full.gen.txt
 

http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/7b346367/make-whitelist.sh
----------------------------------------------------------------------
diff --git a/make-whitelist.sh b/make-whitelist.sh
index b03adac..84e6ed8 100755
--- a/make-whitelist.sh
+++ b/make-whitelist.sh
@@ -1,12 +1,10 @@
 
 # inputs
 
-# TODO take inputs, including OUTPUT_FILENAME
-
-if [ -z "$3" ]; then echo "Usage: make-whitelist.sh REPO_DIR DIRS_TO_FOLLOW OUTPUT_FILENAME" ; exit 1 ; fi
+if [ -z "$3" ]; then echo "Usage: make-whitelist.sh REPO_DIR PATH_PREFIX_FILE OUTPUT_FILENAME" ; exit 1 ; fi
 
 export REPO=$1
-export DIRS=$2
+export PREFIX_FILE=$2
 export OUTPUT_FILENAME=$3
 
 # output
@@ -18,39 +16,60 @@ export OUTPUT=${ORIG_DIR}/${OUTPUT_FILENAME}
 # working
 
 # file/paths we have left to look at, built up for the next cycle on one cycle,
-# starting with the DIRS
+# starting with the PREFIX_FILE
 export TODO_REMAINING=${ORIG_DIR}/TODO-remaining
 
 # file/paths encountered on one cycle
 export TODO_HERE=${ORIG_DIR}/TODO-here
 
-
-rm $OUTPUT
-for x in $DIRS ; do echo $x >> $OUTPUT ; done
+sort -u -o $OUTPUT $PREFIX_FILE
 cp $OUTPUT $TODO_REMAINING
+SAMPLE_PATHS=`head -4 $PREFIX_FILE`" and "`( gshuf $PREFIX_FILE 2> /dev/null || echo "maybe others" ) | head -4`
 
 pushd $REPO > /dev/null
 
-echo scanning $REPO for all files
+echo scanning $REPO for relevant files in history for $OUTPUT_FILENAME starting with `cat $TODO_REMAINING | wc -l` paths including $SAMPLE_PATHS
 
 while [ -s $TODO_REMAINING ] ; do
 
-  echo current scan has `wc $TODO_REMAINING | awk '{print $1}'` paths including `head -1 $TODO_REMAINING`
-  rm -f $TODO_HERE
-  touch $TODO_HERE
+  echo current pass has `cat $TODO_REMAINING | wc -l` paths including `( gshuf $TODO_REMAINING 2> /dev/null || cat $TODO_REMAINING ) | head -4`
 
-  for x in `cat $TODO_REMAINING` ; do
-    # NB: this doesn't work with spsces in the filename; we just have a few though and they're manually added
-    git log --format='%H' --name-status --follow -- $x | awk '{if ($3) print $3; if ($2) print $2;}' | sort -u | cat $TODO_HERE - > ${TODO_HERE}2
-    mv ${TODO_HERE}2 ${TODO_HERE}
-  done
-  cat ${TODO_HERE} | sort -u > ${TODO_HERE}2
-  mv ${TODO_HERE}2 ${TODO_HERE}
+#  echo PICKED UP for $OUTPUT_FILENAME : >> ${ORIG_DIR}/log
+#  cat $TODO_REMAINING >> ${ORIG_DIR}/log
+
+  rm -f $TODO_HERE
 
-  diff --new-line-format="" --unchanged-line-format="" ${TODO_HERE} $OUTPUT > ${TODO_HERE}_new
-  cat $OUTPUT ${TODO_HERE}_new | sort -u > ${OUTPUT}2
-  mv ${OUTPUT}2 ${OUTPUT}
-  mv ${TODO_HERE}_new $TODO_REMAINING
+  echo collecting relevant commits...
+  cat $TODO_REMAINING | xargs -L -n100 git log --format='%H' --diff-filter=A -- >> ${TODO_HERE}_ids
+
+  sort -u ${TODO_HERE}_ids -o ${TODO_HERE}_ids
+#  echo IDS | cat - ${TODO_HERE}_ids >> ${ORIG_DIR}/log
+
+  rm -f ${TODO_HERE}_allpaths
+  echo gathering files from `cat ${TODO_HERE}_ids | wc -l` commits...
+  # 50% match is a bit low but better safe than sorry for moves; for copies we go higher
+  cat ${TODO_HERE}_ids | xargs -L -n100 git show -l99999 -M50 -C90 --name-status --format="ID: %H" | grep -v ^ID: | awk -F $'\t' '{ if ($3) print $3"\t"$2; else print $2; }' | sort -u >> ${TODO_HERE}_allpaths
+
+  echo comparing `cat ${TODO_HERE}_allpaths | wc -l` candidate files against paths...
+  cat $TODO_REMAINING | awk '{print $0"\tMATCH_THIS" }' | cat - ${TODO_HERE}_allpaths | sort -u > ${TODO_HERE}_merged
+  cat ${TODO_HERE}_merged | awk -F $'\t' '{ 
+    if ($2=="MATCH_THIS") { 
+      if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; } 
+      if (last1==patt) { print last1; if (last2) print last2; } 
+      last1=""; 
+    } else { 
+      last1=$1; last2=$2; 
+      if (patt && substr(last1,0,length(patt))==patt) { print last1; if (last2) print last2; } 
+    } }' | sort -u -o ${TODO_HERE}
+   # logging for the above, if needed
+#  echo MATCHING for $OUTPUT_FILENAME : >> ${ORIG_DIR}/log
+#  cat ${TODO_HERE}_merged | awk -F $'\t' '{ if ($2=="MATCH_THIS") { if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; } 
+#      if (last1==patt) { print "MATCH LAST on "patt" ADDS "last1" "last2; } last1=""; }
+#    else { last1=$1; last2=$2; if (patt && substr(last1,0,length(patt))==patt) { print "MATCH NEXT on "patt" ADDS "last1" "last2; } } }' >> ${ORIG_DIR}/log
+
+  comm -23 ${TODO_HERE} $OUTPUT > ${TODO_REMAINING}
+  cat $OUTPUT ${TODO_HERE} | sort -u -o ${OUTPUT}
+  rm ${TODO_HERE}_*
 
 done
 
@@ -59,5 +78,5 @@ popd > /dev/null
 rm ${TODO_REMAINING}
 rm ${TODO_HERE}
 
-echo completed scan of $REPO, history has `wc ${OUTPUT} | awk '{print $1}'` files
+echo completed scan of $REPO in $OUTPUT_FILENAME, relevant history has `wc ${OUTPUT} | awk '{print $1}'` files
 

http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/7b346367/uber-repo-whitelist.txt
----------------------------------------------------------------------
diff --git a/uber-repo-whitelist.txt b/uber-repo-whitelist.txt
new file mode 100644
index 0000000..f2a1baf
--- /dev/null
+++ b/uber-repo-whitelist.txt
@@ -0,0 +1,2 @@
+README.md
+brooklyn/