You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by am...@apache.org on 2020/09/21 18:11:40 UTC

[beam] branch master updated: [BEAM-9136] Add python dependency license CSV for license URL and type

This is an automated email from the ASF dual-hosted git repository.

amyrvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git


The following commit(s) were added to refs/heads/master by this push:
     new 8fe5805  [BEAM-9136] Add python dependency license CSV for license URL and type
     new 9d2fce7  Merge pull request #12879 from alanmyrvold/py-license-list
8fe5805 is described below

commit 8fe5805454d957d40127efee5236461712642dc3
Author: Alan Myrvold <am...@google.com>
AuthorDate: Fri Sep 18 23:54:44 2020 +0000

    [BEAM-9136] Add python dependency license CSV for license URL and type
---
 sdks/python/container/Dockerfile                   |  6 ++-
 .../container/license_scripts/pull_licenses_py.py  | 50 ++++++++++++++++++++--
 2 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/sdks/python/container/Dockerfile b/sdks/python/container/Dockerfile
index a7c498b..9d1d3a8 100644
--- a/sdks/python/container/Dockerfile
+++ b/sdks/python/container/Dockerfile
@@ -45,7 +45,8 @@ RUN \
     # Check that the fast implementation of protobuf is used.
     python -c "from google.protobuf.internal import api_implementation; assert api_implementation._default_implementation_type == 'cpp'; print ('Verified fast protobuf used.')" && \
     # Remove pip cache.
-    rm -rf /root/.cache/pip
+    rm -rf /root/.cache/pip && \
+    rm -rf /tmp/base_image_requirements.txt
 
 # Configure ccache prior to installing the SDK.
 RUN ln -s /usr/bin/ccache /usr/local/bin/gcc
@@ -76,6 +77,9 @@ RUN if [ "$pull_licenses" = "true" ] ; then \
     rm -rf /opt/apache/beam/third_party_licenses ; \
    fi
 
+# Remove license scripts
+RUN rm -rf /tmp/license_scripts
+
 # Log complete list of what exact packages and versions are installed.
 RUN pip freeze --all
 # Make sure there are no conflicting dependencies.
diff --git a/sdks/python/container/license_scripts/pull_licenses_py.py b/sdks/python/container/license_scripts/pull_licenses_py.py
index c987015..75f629d 100644
--- a/sdks/python/container/license_scripts/pull_licenses_py.py
+++ b/sdks/python/container/license_scripts/pull_licenses_py.py
@@ -19,6 +19,7 @@
 A script to pull licenses for Python.
 The script is executed within Docker.
 """
+import csv
 import json
 import logging
 import os
@@ -30,6 +31,9 @@ import traceback
 import yaml
 
 from future.moves.urllib.request import urlopen
+from future.moves.urllib.parse import urlparse
+from future.moves.urllib.parse import urljoin
+
 from tenacity import retry
 from tenacity import stop_after_attempt
 from tenacity import wait_exponential
@@ -42,7 +46,7 @@ def run_bash_command(command):
 
 
 def run_pip_licenses():
-  command = 'pip-licenses --with-license-file --format=json'
+  command = 'pip-licenses --with-license-file --with-urls --from=mixed --ignore apache-beam --format=json'
   dependencies = run_bash_command(command)
   return json.loads(dependencies)
 
@@ -53,7 +57,7 @@ def copy_license_files(dep):
   if source_license_file.lower() == 'unknown':
     return False
   name = dep['Name'].lower()
-  dest_dir = '/'.join([LICENSE_DIR, name])
+  dest_dir = os.path.join(LICENSE_DIR, name)
   try:
     os.mkdir(dest_dir)
     shutil.copy(source_license_file, dest_dir + '/LICENSE')
@@ -84,7 +88,7 @@ def pull_from_url(dep, configs):
   '''
   if dep in configs:
     config = configs[dep]
-    dest_dir = '/'.join([LICENSE_DIR, dep])
+    dest_dir = os.path.join(LICENSE_DIR, dep)
     cur_temp_dir = tempfile.mkdtemp()
 
     try:
@@ -116,6 +120,43 @@ def pull_from_url(dep, configs):
       shutil.rmtree(cur_temp_dir)
 
 
+def license_url(name, project_url, dep_config):
+  '''
+  Gets the license URL for a dependency, either from the parsed yaml or,
+  if it is github, by looking for a license file in the repo.
+  '''
+  configs = dep_config['pip_dependencies']
+  if name.lower() in configs:
+    return configs[name.lower()]['license']
+  p = urlparse(project_url)
+  if p.netloc != "github.com":
+    return project_url
+  raw = "https://raw.githubusercontent.com"
+  path = p.path
+  if not path.endswith("/"):
+    path = path + "/"
+  for license in ("LICENSE", "LICENSE.txt", "LICENSE.md", "LICENSE.rst", "COPYING"):
+    try:
+      url = raw + urljoin(path,"master/"+license)
+      with urlopen(url) as a:
+        if a.getcode() == 200:
+          return url
+    except:
+      pass
+  return project_url
+
+
+def save_license_list(csv_filename, dependencies, dep_config):
+  '''
+  Save the names, URLs, and license type for python dependency licenses in a CSV file.
+  '''
+  with open(csv_filename, mode='w') as f:
+    writer = csv.writer(f)
+    for dep in dependencies:
+      url = license_url(dep['Name'], dep['URL'], dep_config)
+      writer.writerow([dep['Name'], url, dep['License']])
+
+
 if __name__ == "__main__":
   no_licenses = []
   logging.getLogger().setLevel(logging.INFO)
@@ -124,6 +165,9 @@ if __name__ == "__main__":
     dep_config = yaml.full_load(file)
 
   dependencies = run_pip_licenses()
+  csv_filename = os.path.join(LICENSE_DIR, 'python-licenses.csv')
+  save_license_list(csv_filename, dependencies, dep_config)
+
   # add licenses for pip installed packages.
   # try to pull licenses with pip-licenses tool first, if no license pulled,
   # then pull from URLs.