From fd1b7e09ada5a4921f271a2339d05c697eb31aa4 Mon Sep 17 00:00:00 2001 From: Sebastian Schuberth Date: Thu, 12 Jan 2017 09:53:15 +0100 Subject: [PATCH 1/2] cli: Fix some minor typos in a comment --- src/scancode/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scancode/cli.py b/src/scancode/cli.py index 85de1edb804..6779f08759c 100644 --- a/src/scancode/cli.py +++ b/src/scancode/cli.py @@ -696,7 +696,7 @@ def save_results(files_count, scanned_files, format, input, output_file): for file_data in scanned_files: file_entry = File(file_data['path']) - # FIXME: should we really compue the checcksum here rather than get it from the scan? + # FIXME: should we really compute the checksum here rather than getting it from the scan? file_entry.chk_sum = Algorithm('SHA1', file_entry.calc_chksum()) for file_license in file_data['licenses']: spdx_id = file_license.get('spdx_license_key') From 85327e5bbf7bb335e8e86914c0ff2049f942a1dd Mon Sep 17 00:00:00 2001 From: Sebastian Schuberth Date: Thu, 12 Jan 2017 13:48:04 +0100 Subject: [PATCH 2/2] cli: Read a file's SHA1 from the cache instead of recalculating it Closes #448. --- src/scancode/cli.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/scancode/cli.py b/src/scancode/cli.py index 6779f08759c..badbdcdb9c0 100644 --- a/src/scancode/cli.py +++ b/src/scancode/cli.py @@ -312,6 +312,10 @@ def scancode(ctx, input, output_file, copyright, license, package, license = True package = True + # A hack to force info being exposed for SPDX output in order to reuse calculated file SHA1s. + if format in ('spdx-tv', 'spdx-rdf'): + info = True + scans_cache_class = get_scans_cache_class() try: files_count, results = scan(input_path=input, @@ -678,7 +682,7 @@ def save_results(files_count, scanned_files, format, input, output_file): output_file.write(unicode(json.dumps(meta, separators=(',', ':'), iterable_as_array=True, encoding='utf-8'))) output_file.write('\n') - elif format == 'spdx-tv' or format == 'spdx-rdf': + elif format in ('spdx-tv', 'spdx-rdf'): from spdx.checksum import Algorithm from spdx.creationinfo import Tool from spdx.document import Document, License @@ -695,10 +699,15 @@ def save_results(files_count, scanned_files, format, input, output_file): doc.package = Package(input, NoAssert()) for file_data in scanned_files: - file_entry = File(file_data['path']) - # FIXME: should we really compute the checksum here rather than getting it from the scan? - file_entry.chk_sum = Algorithm('SHA1', file_entry.calc_chksum()) - for file_license in file_data['licenses']: + file_sha1 = file_data.get('sha1') + if not file_sha1: + # Skip directories. + continue + + file_entry = File(file_data.get('path')) + file_entry.chk_sum = Algorithm('SHA1', file_sha1) + + for file_license in file_data.get('licenses'): spdx_id = file_license.get('spdx_license_key') # TODO: we should create a "LicenseRef:xxx" identifier # if the license is not known to SPDX