From fd1b7e09ada5a4921f271a2339d05c697eb31aa4 Mon Sep 17 00:00:00 2001
From: Sebastian Schuberth <sschuberth@gmail.com>
Date: Thu, 12 Jan 2017 09:53:15 +0100
Subject: [PATCH 1/2] cli: Fix some minor typos in a comment

---
 src/scancode/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scancode/cli.py b/src/scancode/cli.py
index 85de1edb804..6779f08759c 100644
--- a/src/scancode/cli.py
+++ b/src/scancode/cli.py
@@ -696,7 +696,7 @@ def save_results(files_count, scanned_files, format, input, output_file):
 
         for file_data in scanned_files:
             file_entry = File(file_data['path'])
-            # FIXME: should we really compue the checcksum here rather than get it from the scan?
+            # FIXME: should we really compute the checksum here rather than getting it from the scan?
             file_entry.chk_sum = Algorithm('SHA1', file_entry.calc_chksum())
             for file_license in file_data['licenses']:
                 spdx_id = file_license.get('spdx_license_key')

From 85327e5bbf7bb335e8e86914c0ff2049f942a1dd Mon Sep 17 00:00:00 2001
From: Sebastian Schuberth <sschuberth@gmail.com>
Date: Thu, 12 Jan 2017 13:48:04 +0100
Subject: [PATCH 2/2] cli: Read a file's SHA1 from the cache instead of
 recalculating it

Closes #448.
---
 src/scancode/cli.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/scancode/cli.py b/src/scancode/cli.py
index 6779f08759c..badbdcdb9c0 100644
--- a/src/scancode/cli.py
+++ b/src/scancode/cli.py
@@ -312,6 +312,10 @@ def scancode(ctx, input, output_file, copyright, license, package,
         license = True
         package = True
 
+    # HACK: force the "info" option on for SPDX output so the file SHA1s it exposes can be reused when saving results.
+    if format in ('spdx-tv', 'spdx-rdf'):
+        info = True
+
     scans_cache_class = get_scans_cache_class()
     try:
         files_count, results = scan(input_path=input,
@@ -678,7 +682,7 @@ def save_results(files_count, scanned_files, format, input, output_file):
             output_file.write(unicode(json.dumps(meta, separators=(',', ':'), iterable_as_array=True, encoding='utf-8')))
         output_file.write('\n')
 
-    elif format == 'spdx-tv' or format == 'spdx-rdf':
+    elif format in ('spdx-tv', 'spdx-rdf'):
         from spdx.checksum import Algorithm
         from spdx.creationinfo import Tool
         from spdx.document import Document, License
@@ -695,10 +699,15 @@ def save_results(files_count, scanned_files, format, input, output_file):
         doc.package = Package(input, NoAssert())
 
         for file_data in scanned_files:
-            file_entry = File(file_data['path'])
-            # FIXME: should we really compute the checksum here rather than getting it from the scan?
-            file_entry.chk_sum = Algorithm('SHA1', file_entry.calc_chksum())
-            for file_license in file_data['licenses']:
+            file_sha1 = file_data.get('sha1')
+            if not file_sha1:
+                # Skip entries that have no SHA1 (e.g. directories).
+                continue
+
+            file_entry = File(file_data.get('path'))
+            file_entry.chk_sum = Algorithm('SHA1', file_sha1)
+
+            for file_license in file_data.get('licenses'):
                 spdx_id = file_license.get('spdx_license_key')
                 # TODO: we should create a "LicenseRef:xxx" identifier 
                 # if the license is not known to SPDX