You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@drat.apache.org by GitBox <gi...@apache.org> on 2018/08/14 13:51:12 UTC

[GitHub] chrismattmann closed pull request #153: This is the code for breaking dratstat

chrismattmann closed pull request #153: This is the code for breaking dratstat
URL: https://github.com/apache/drat/pull/153
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/distribution/src/main/resources/bin/dratstats.py b/distribution/src/main/resources/bin/dratstats.py
index cae5c897..43ff1c86 100644
--- a/distribution/src/main/resources/bin/dratstats.py
+++ b/distribution/src/main/resources/bin/dratstats.py
@@ -268,7 +268,7 @@ def run(repos_list, output_dir):
 			print("\nOODT Started: OK\n")
 
 			print('Adding repository: '+str(rep)+' to Solr')
-			index_solr(json.dumps([rep]))
+			# index_solr(json.dumps([rep]))
 
 
 			print("\nRunning DRAT on " + rep["repo"] + " ...\n")
@@ -295,178 +295,9 @@ def run(repos_list, output_dir):
 					wait_for_job("urn:drat:MimePartitioner")
 					wait_for_job("urn:drat:RatCodeAudit")
 					stats['map_end'] = current_datetime()
-
-					if retval:
-						time.sleep(5)
-						stats['reduce_start'] = current_datetime()
-						
-						# Extract data from RatAggregate File
-						totalNotes = 0
-						totalBinaries = 0
-						totalArchives = 0
-						totalStandards = 0
-						totalApache = 0
-						totalGenerated = 0
-						totalUnknown = 0
-
-						rat_dir = os.getenv("DRAT_HOME") + "/data/archive/rat"
-
-						# Iterate over all RAT log files 
-						for root, dirs, files in os.walk(rat_dir):
-							for filename in files:
-								if filename.endswith(".log"):
-									(notes, binaries, archives,standards,apachelicensed,generated,unknown) = parseFile(os.path.join(root, filename))
-									totalNotes = totalNotes + notes
-									totalBinaries = totalBinaries + binaries
-									totalArchives = totalArchives + archives
-									totalStandards = totalStandards + standards
-									totalApache = totalApache + apachelicensed
-									totalGenerated = totalGenerated + generated
-									totalUnknown = totalUnknown + unknown
-
-						stats["license_Notes"] = totalNotes
-						stats["license_Binaries"] = totalBinaries
-						stats["license_Archives"] = totalArchives
-						stats["license_Standards"] = totalStandards
-						stats["license_Apache"] = totalApache
-						stats["license_Generated"] = totalGenerated
-						stats["license_Unknown"] = totalUnknown
-
-						stats['reduce_end'] = current_datetime()
-						print "\nDRAT Scan Completed: OK\n"
-
-			time.sleep(5)
-
-			if retval:
-				# Copy Data with datetime variables above, extract output from RatAggregate file, extract data from Solr Core
-				printnow ("\nCopying data to Solr and Output Directory...\n")
-
-				# Extract data from Solr
-				neg_mimetype = ["image", "application", "text", "video", "audio", "message", "multipart"]
-				connection = urllib2.urlopen(os.getenv("SOLR_URL") + "/drat/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true")
-				
-				response = eval(connection.read())
-				mime_count = response["facet_counts"]["facet_fields"]["mimetype"]
-
-				for i in range(0, len(mime_count), 2):
-					if mime_count[i].split("/")[0] not in neg_mimetype:
-						stats["mime_" + mime_count[i]] = mime_count[i + 1]
-
-
-				# Count the number of files
-				stats["files"] = count_num_files(rep["repo"], ".git")
-
-				# Write data into Solr
-				stats["type"] = 'software'
-				stats_data = []
-				stats_data.append(stats)
-				json_data = json.dumps(stats_data)
-				index_solr(json_data)
-
-				# Parse RAT logs
-				rat_logs_dir = os.getenv("DRAT_HOME") + "/data/archive/rat/*/*.log"
-				rat_license = {}
-				rat_header = {}
-				for filename in glob.glob(rat_logs_dir):
-					#print('=' * 20)
-					l = 0
-					h = 0
-					cur_file = ''
-					cur_header = ''
-					cur_section = ''
-					parsedHeaders = False
-					parsedLicenses = False
-					
-					with open(filename, 'rb') as f:
-						printnow('Parsing rat log: ['+filename+']')
-						for line in f:
-							if '*****************************************************' in line:
-								l = 0
-								h = 0
-								if cur_section == 'licenses':
-									parsedLicenses = True
-								if cur_section == 'headers':
-									parsedHeaders = True
-									
-								cur_file = ''
-								cur_header = ''
-								cur_section = ''
-							if line.startswith('  Files with Apache') and not parsedLicenses:
-								cur_section = 'licenses'
-							if line.startswith(' Printing headers for ') and not parsedHeaders:
-								cur_section = 'headers'
-							if cur_section == 'licenses':
-								l += 1
-								if l > 4:
-									line = line.strip()
-									if line:
-										print("File: %s with License Line: %s" % (filename, line))
-										li = parse_license(line)
-										rat_license[li[0]] = li[1]
-									 	print(li)
-							if cur_section == 'headers':
-								if '=====================================================' in line or '== File:' in line:
-									h += 1
-								if h == 2:
-									cur_file = line.split("/")[-1].strip()
-								if h == 3:
-									cur_header += line
-								if h == 4:
-									rat_header[cur_file] = cur_header.split("\n", 1)[1]
-									cur_file = ''
-									cur_header = ''
-									h = 1
-					if h == 3:
-						rat_header[cur_file] = cur_header.split("\n", 1)[1]
-					parsedHeaders = True
-					parsedLicenses = True
-
-				# Index RAT logs into Solr
-				connection = urllib2.urlopen(os.getenv("SOLR_URL") +
-											 "/drat/select?q=*%3A*&fl=filename%2Cfilelocation%2Cmimetype&wt=python&rows="
-											 + str(stats["files"]) +"&indent=true")
-				response = eval(connection.read())
-				docs = response['response']['docs']
-				file_data = []
-				batch = 100
-				dc = 0
-				
-				for doc in docs:
-					fdata = {}
-					fdata['id'] = os.path.join(doc['filelocation'][0], doc['filename'][0])
-					m = md5.new()
-					m.update(fdata['id'])
-					hashId = m.hexdigest()
-					fileId = hashId+"-"+doc['filename'][0]
-
-					if fileId not in rat_license:
-						print "File: "+str(fdata['id'])+": ID: ["+fileId+"] not present in parsed licenses => Likely file copying issue. Skipping."
-						continue #handle issue with DRAT #93
-					
-					fdata["type"] = 'file'
-					fdata['parent'] = rep["repo"]
-					fdata['mimetype'] = doc['mimetype'][0]
-					fdata['license'] = rat_license[fileId]
-					if fileId in rat_header:
-						fdata['header'] = rat_header[fileId]
-					file_data.append(fdata)
-					dc += 1
-					if dc % batch == 0:
-						json_data = json.dumps(file_data)
-						index_solr(json_data)
-						file_data = []
-				if dc % batch != 0:
-					json_data = json.dumps(file_data)
-					index_solr(json_data)
-
-				# Copying data to Output Directory
-				repos_out = output_dir + "/" + normalize_path(rep["repo"])
-				shutil.copytree(os.getenv("DRAT_HOME") + "/data", repos_out)
-				print("\nData copied to Solr and Output Directory: OK\n")
-
-			else:
-				print ("\nDRAT Scan Completed: Resulted in Error\n")
-
+					print ("\nwaiting for Rat Aggregator...\n")
+					wait_for_job("urn:drat:RatAggregator")
+			
 
 			time.sleep(5)
 			print ("\nStopping OODT...\n")
diff --git a/nohup.out b/nohup.out
new file mode 100644
index 00000000..bb32b5eb
--- /dev/null
+++ b/nohup.out
@@ -0,0 +1,3 @@
+Started dynamic workflow with id '6453cca6-9f30-11e8-b99d-f5018c8e9233'
+
+Navigate to http://localhost:8080/opsui/ to view the OODT browser and http://localhost:8080/solr to view the Solr catalog.
diff --git a/webapps/proteus-new/src/main/webapp/resources/src/components/statisticscomp.vue b/webapps/proteus-new/src/main/webapp/resources/src/components/statisticscomp.vue
index 1280e9ce..adebf8b2 100644
--- a/webapps/proteus-new/src/main/webapp/resources/src/components/statisticscomp.vue
+++ b/webapps/proteus-new/src/main/webapp/resources/src/components/statisticscomp.vue
@@ -156,7 +156,7 @@ the License.
         return this.stat.crawledfiles/this.stat.numOfFiles *100;
       },
       indexingprogress(){
-        return this.stat.indexedfiles/this.stat.numberOfFiles * 100;
+        return this.stat.indexedfiles/this.stat.numOfFiles * 100;
       }
     }
 }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services