You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@brooklyn.apache.org by he...@apache.org on 2016/02/01 19:01:12 UTC
[14/50] [abbrv] brooklyn-client git commit: refactor prefix filtering
and spike the same update-index trick to step 2 initial history clean,
but it's not faster here
refactor prefix filtering and spike the same update-index trick to step 2 initial history clean, but it's not faster here
Project: http://git-wip-us.apache.org/repos/asf/brooklyn-client/repo
Commit: http://git-wip-us.apache.org/repos/asf/brooklyn-client/commit/4e533c85
Tree: http://git-wip-us.apache.org/repos/asf/brooklyn-client/tree/4e533c85
Diff: http://git-wip-us.apache.org/repos/asf/brooklyn-client/diff/4e533c85
Branch: refs/heads/master
Commit: 4e533c854e9748789484c5ad5bbd75839e19e74c
Parents: 61ef068
Author: Alex Heneveld <al...@cloudsoftcorp.com>
Authored: Wed Dec 16 14:41:02 2015 +0000
Committer: Alex Heneveld <al...@cloudsoftcorp.com>
Committed: Wed Dec 16 16:59:48 2015 +0000
----------------------------------------------------------------------
2-clean-history.sh | 6 +++++-
grep-lines-starting.sh | 20 ++++++++++++++++++++
make-whitelist.sh | 16 +---------------
3 files changed, 26 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/4e533c85/2-clean-history.sh
----------------------------------------------------------------------
diff --git a/2-clean-history.sh b/2-clean-history.sh
index dd23a62..779bc17 100755
--- a/2-clean-history.sh
+++ b/2-clean-history.sh
@@ -16,7 +16,11 @@ git checkout master
# now make master reorganised, if reorg branch exists
git reset --hard reorg
-git filter-branch --index-filter "git rm -r --cached --ignore-unmatch $(echo $( cat ${basedir}/big-files-to-remove.txt ))" master ${branches}
+git filter-branch -f --index-filter "git rm -r -q --cached --ignore-unmatch $(echo $( cat ${basedir}/big-files-to-remove.txt ))" master
+## above is slightly faster than below, because (compared w step 3) we have fewer patterns and (compared w step 4) we are benefitting from rm's native pattern matching
+# git filter-branch -f --index-filter \
+# "git ls-files > /tmp/TMP-clean-history-LS ; ${basedir}/grep-lines-starting.sh ${basedir}/big-files-to-remove.txt /tmp/TMP-clean-history-LS | git update-index --force-remove --stdin" \
+# --tag-name-filter cat --prune-empty master ${branches}
# option 2: delete the entire example *if* it contains binaries but keep it if it doesn't - means that the project will suddenly appear in history but should work when it does appear
# (we have gone for option 1, just cutting the big files)
http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/4e533c85/grep-lines-starting.sh
----------------------------------------------------------------------
diff --git a/grep-lines-starting.sh b/grep-lines-starting.sh
new file mode 100755
index 0000000..df1e29c
--- /dev/null
+++ b/grep-lines-starting.sh
@@ -0,0 +1,20 @@
+
+# efficient way to find lines beginning with any of the given prefixes, sorted
+# much faster than grep -f (with or without -F) for big files because of the line start logic and sort -- O(N log N) rather than O(N^2)
+
+if [ -z "$2" ] ; then echo "Usage: grep-lines-starting.sh <prefix_file> <lines> # to find all lines starting with any prefix in <prefix_file>" ; exit 1 ; fi
+
+PREFIX_FILE=$1
+INPUT=$2
+TMP=/tmp/remove-prefixes-tmp
+
+cat $PREFIX_FILE | awk '{if ($1) print $0"\tMATCH_THIS" }' | cat - $INPUT | sort -u > ${TMP}_merged
+cat ${TMP}_merged | awk -F $'\t' '{
+ if ($2=="MATCH_THIS") {
+ if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; }
+ if (last==patt) { print last; }
+ } else {
+ last=$0;
+ if (patt && substr(last,0,length(patt))==patt) { print last; }
+ } }' | sort -u
+
http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/4e533c85/make-whitelist.sh
----------------------------------------------------------------------
diff --git a/make-whitelist.sh b/make-whitelist.sh
index 84e6ed8..eb31841 100755
--- a/make-whitelist.sh
+++ b/make-whitelist.sh
@@ -51,21 +51,7 @@ while [ -s $TODO_REMAINING ] ; do
cat ${TODO_HERE}_ids | xargs -L -n100 git show -l99999 -M50 -C90 --name-status --format="ID: %H" | grep -v ^ID: | awk -F $'\t' '{ if ($3) print $3"\t"$2; else print $2; }' | sort -u >> ${TODO_HERE}_allpaths
echo comparing `cat ${TODO_HERE}_allpaths | wc -l` candidate files against paths...
- cat $TODO_REMAINING | awk '{print $0"\tMATCH_THIS" }' | cat - ${TODO_HERE}_allpaths | sort -u > ${TODO_HERE}_merged
- cat ${TODO_HERE}_merged | awk -F $'\t' '{
- if ($2=="MATCH_THIS") {
- if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; }
- if (last1==patt) { print last1; if (last2) print last2; }
- last1="";
- } else {
- last1=$1; last2=$2;
- if (patt && substr(last1,0,length(patt))==patt) { print last1; if (last2) print last2; }
- } }' | sort -u -o ${TODO_HERE}
- # logging for the above, if needed
-# echo MATCHING for $OUTPUT_FILENAME : >> ${ORIG_DIR}/log
-# cat ${TODO_HERE}_merged | awk -F $'\t' '{ if ($2=="MATCH_THIS") { if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; }
-# if (last1==patt) { print "MATCH LAST on "patt" ADDS "last1" "last2; } last1=""; }
-# else { last1=$1; last2=$2; if (patt && substr(last1,0,length(patt))==patt) { print "MATCH NEXT on "patt" ADDS "last1" "last2; } } }' >> ${ORIG_DIR}/log
+ ${ORIG_DIR}/grep-lines-starting.sh ${TODO_REMAINING} ${TODO_HERE}_allpaths | awk -F $'\t' '{print $1; if ($2) print $2;}' | sort -u -o ${TODO_HERE}
comm -23 ${TODO_HERE} $OUTPUT > ${TODO_REMAINING}
cat $OUTPUT ${TODO_HERE} | sort -u -o ${OUTPUT}