Skip to content

Commit 4ce6c8a

Browse files
committed
Scancode: Fix false positive reported by scancode output analyser script
ScanCode can possibly return many licenses found for a single file scanned. This commit ensures that the file is not reported as lacking a permissive license if at least one license found in it is permissive. Previously the script was reporting an issue if it found at least one license in a file that was not permissive. Additionally, catch more errors and provide specific details about failures. Provide unit tests.
1 parent 6fa88f4 commit 4ce6c8a

File tree

7 files changed

+1277
-1244
lines changed

7 files changed

+1277
-1244
lines changed

.travis.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,18 +70,18 @@ matrix:
7070
| ( grep -v '^tools/test/toolchains/api_test.py' || true ) \
7171
| while read file; do cp --parents "${file}" SCANCODE; done
7272
- scancode -l --json-pp scancode.json SCANCODE
73-
- python ./tools/test/travis-ci/scancode-evaluate.py -f scancode.json || true
73+
- python ./tools/test/travis-ci/scancode-evaluate.py scancode.json || true
7474
# run the same but for new files. All new files must have SPDX
7575
- >-
7676
git diff --name-only --diff-filter=A FETCH_HEAD..HEAD \
7777
| ( grep '.\(c\|cpp\|h\|hpp\|py\)$' || true ) \
7878
| ( grep -v '^tools/test/toolchains/api_test.py' || true ) \
7979
| while read file; do cp --parents "${file}" SCANCODE_NEW_FILES; done
8080
- scancode -l --json-pp scancode_new_files.json SCANCODE_NEW_FILES
81-
- python ./tools/test/travis-ci/scancode-evaluate.py -f scancode_new_files.json || true
81+
- python ./tools/test/travis-ci/scancode-evaluate.py scancode_new_files.json || true
8282
- cat scancode-evaluate.log
8383
- COUNT=$(cat scancode-evaluate.log | grep 'File:' | wc -l) || true
84-
- python ./tools/test/travis-ci/scancode-evaluate.py -f scancode_new_files.json
84+
- python ./tools/test/travis-ci/scancode-evaluate.py scancode_new_files.json
8585
- cat scancode-evaluate.log
8686
- COUNT_NEW_FILES=$(cat scancode-evaluate.log | grep 'File:' | wc -l) || true
8787
- |

tools/test/travis-ci/scancode-evaluate.py

Lines changed: 111 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -13,137 +13,152 @@
1313
distributed under the License is distributed on an "AS IS" BASIS,
1414
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1515
See the License for the specific language governing permissions and
16-
limitations
16+
limitations
1717
"""
1818

19-
# Asumptions for this script:
20-
# 1. directory_name is scanned directory.
21-
# Files are copied to this directory with full tree. As result, if we find
22-
# license offender, we can have full path (just scrape directory_name). We do this
23-
# magic because scancode allows to scan directories/one file.
24-
# 2. SPDX and license text is a must for all code files
25-
26-
import json
2719
import argparse
28-
import sys
29-
import os.path
20+
import json
3021
import logging
22+
import os.path
3123
import re
32-
33-
userlog = logging.getLogger("scancode-evaluate")
34-
userlog.setLevel(logging.INFO)
35-
logfile = os.path.join(os.getcwd(), 'scancode-evaluate.log')
36-
log_file_handler = logging.FileHandler(logfile, mode='w')
37-
userlog.addHandler(log_file_handler)
24+
import sys
25+
from enum import Enum
3826

3927
MISSING_LICENSE_TEXT = "Missing license header"
40-
MISSING_PERMISIVE_LICENSE_TEXT = "Non-permissive license"
28+
MISSING_PERMISSIVE_LICENSE_TEXT = "Non-permissive license"
4129
MISSING_SPDX_TEXT = "Missing SPDX license identifier"
4230

43-
def license_check(directory_name, file):
44-
""" Check licenses in the scancode json file for specified directory
31+
userlog = logging.getLogger("scancode-evaluate")
32+
33+
class ReturnCode(Enum):
34+
"""Return codes."""
35+
36+
SUCCESS = 0
37+
ERROR = -1
38+
39+
40+
def init_logger():
41+
"""Initialise the logger."""
42+
userlog.setLevel(logging.INFO)
43+
userlog.addHandler(
44+
logging.FileHandler(
45+
os.path.join(os.getcwd(), 'scancode-evaluate.log'), mode='w'
46+
)
47+
)
48+
49+
50+
def path_leaf(path):
51+
"""Return the leaf of a path."""
52+
head, tail = os.path.split(path)
53+
# Ensure the correct file name is returned if the file ends with a slash
54+
return tail or os.path.basename(head)
55+
56+
57+
def has_permissive_text_in_scancode_output(scancode_output_data_file_licenses):
58+
"""Returns true if at least one license in the scancode output is permissive."""
59+
return any(
60+
scancode_output_data_file_license['category'] == 'Permissive'
61+
for scancode_output_data_file_license in scancode_output_data_file_licenses
62+
)
63+
64+
65+
def has_spdx_text_in_scancode_output(scancode_output_data_file_licenses):
66+
"""Returns true if at least one license in the scancode output has the spdx identifier."""
67+
return any(
68+
'spdx' in scancode_output_data_file_license['matched_rule']['identifier']
69+
for scancode_output_data_file_license in scancode_output_data_file_licenses
70+
)
71+
72+
73+
def has_spdx_text_in_analysed_file(scanned_file_content):
74+
"""Returns true if the file analysed by ScanCode contains SPDX identifier."""
75+
return bool(re.findall("SPDX-License-Identifier:?", scanned_file_content))
76+
77+
78+
def license_check(scancode_output_path):
79+
"""Check licenses in the scancode json file for specified directory.
4580
4681
This function does not verify if file exists, should be done prior the call.
4782
4883
Args:
49-
directory_name - where scancode was run, used to scrape this from paths
50-
file - scancode json output file (output from scancode --license --json-pp)
84+
scancode_output_path: path to the scancode json output file (output from scancode --license --json-pp)
5185
5286
Returns:
5387
0 if nothing found
5488
>0 - count how many license issues found
55-
-1 if any error in file licenses found
89+
ReturnCode.ERROR.value if any error in file licenses found
5690
"""
5791

5892
offenders = []
5993
try:
60-
# find all licenses in the files, must be licensed and permissive
61-
with open(file, 'r') as scancode_output:
62-
results = json.load(scancode_output)
63-
except ValueError:
64-
userlog.warning("JSON could not be decoded")
65-
return -1
66-
67-
try:
68-
for file in results['files']:
69-
license_offender = {}
70-
license_offender['file'] = file
71-
# ignore directory, not relevant here
72-
if license_offender['file']['type'] == 'directory':
73-
continue
74-
if not license_offender['file']['licenses']:
75-
license_offender['reason'] = MISSING_LICENSE_TEXT
76-
offenders.append(license_offender.copy())
94+
with open(scancode_output_path, 'r') as read_file:
95+
scancode_output_data = json.load(read_file)
96+
except json.JSONDecodeError as jex:
97+
userlog.warning("JSON could not be decoded, Invalid JSON in body: %s", jex)
98+
return ReturnCode.ERROR.value
99+
100+
if 'files' not in scancode_output_data:
101+
userlog.warning("Missing `files` attribute in %s" % (scancode_output_path))
102+
return ReturnCode.ERROR.value
103+
104+
for scancode_output_data_file in scancode_output_data['files']:
105+
if scancode_output_data_file['type'] != 'file':
106+
continue
107+
108+
if not scancode_output_data_file['licenses']:
109+
scancode_output_data_file['fail_reason'] = MISSING_LICENSE_TEXT
110+
offenders.append(scancode_output_data_file)
111+
# check the next file in the scancode output
112+
continue
113+
114+
if not has_permissive_text_in_scancode_output(scancode_output_data_file['licenses']):
115+
scancode_output_data_file['fail_reason'] = MISSING_PERMISSIVE_LICENSE_TEXT
116+
offenders.append(scancode_output_data_file)
117+
118+
if not has_spdx_text_in_scancode_output(scancode_output_data_file['licenses']):
119+
# Scancode does not recognize license notice in Python file headers.
120+
# Issue: https://github.com/nexB/scancode-toolkit/issues/1913
121+
# Therefore check if the file tested by ScanCode actually has a licence notice.
122+
file_path = os.path.abspath(scancode_output_data_file['path'])
123+
try:
124+
with open(file_path, 'r') as read_file:
125+
scanned_file_content = read_file.read()
126+
except UnicodeDecodeError:
127+
userlog.warning("Unable to look for SPDX text in `{}`:".format(file_path))
128+
# Ignore files that cannot be decoded
129+
# check the next file in the scancode output
77130
continue
78131

79-
found_spdx = spdx_check(offenders, license_offender)
80-
81-
if not found_spdx:
82-
try:
83-
# Issue reported here https://github.com/nexB/scancode-toolkit/issues/1913
84-
# We verify here if SPDX is not really there as SDPX is part of the license text
85-
# scancode has some problems detecting it properly
86-
with open(os.path.join(os.path.abspath(license_offender['file']['path'])), 'r') as spdx_file_check:
87-
filetext = spdx_file_check.read()
88-
matches = re.findall("SPDX-License-Identifier:?", filetext)
89-
if matches:
90-
continue
91-
license_offender['reason'] = MISSING_SPDX_TEXT
92-
offenders.append(license_offender.copy())
93-
except UnicodeDecodeError:
94-
# not valid file for license check
95-
continue
96-
except KeyError:
97-
userlog.warning("Invalid scancode json file")
98-
return -1
132+
if not has_spdx_text_in_analysed_file(scanned_file_content):
133+
scancode_output_data_file['fail_reason'] = MISSING_SPDX_TEXT
134+
offenders.append(scancode_output_data_file)
99135

100136
if offenders:
101137
userlog.warning("Found files with missing license details, please review and fix")
102138
for offender in offenders:
103-
userlog.warning("File: " + offender['file']['path'][len(directory_name):] + " " + "reason: " + offender['reason'])
139+
userlog.warning("File: %s reason: %s" % (path_leaf(offender['path']), offender['fail_reason']))
104140
return len(offenders)
105141

106142

107-
def spdx_check(offenders, license_offender):
108-
""" Parse through list of licenses to determine whether licenses are permissive
109-
@input list of offender, individual offender dict
110-
@output none
111-
"""
112-
found_spdx = False
113-
# iterate through licenses, stop once permissive license has been found
114-
for i in range(len(license_offender['file']['licenses'])):
115-
# is any of the licenses permissive ?
116-
if license_offender['file']['licenses'][i]['category'] == 'Permissive':
117-
# confirm that it has spdx license key
118-
if license_offender['file']['licenses'][i]['matched_rule']['identifier'].find("spdx") != -1:
119-
found_spdx = True
120-
# if no spdx found return anyway
121-
return found_spdx
122-
# otherwise file is missing permissive license
123-
license_offender['reason'] = MISSING_PERMISIVE_LICENSE_TEXT
124-
offenders.append(license_offender.copy())
125-
126-
# missing spdx and permissive license
127-
return found_spdx
128-
129143
def parse_args():
130-
parser = argparse.ArgumentParser(
131-
description="License check.")
132-
parser.add_argument('-f', '--file',
133-
help="scancode-toolkit output json file")
134-
parser.add_argument('-d', '--directory_name', default="SCANCODE",
135-
help='Directory name where are files being checked')
144+
"""Parse command line arguments."""
145+
parser = argparse.ArgumentParser(description="License check.")
146+
parser.add_argument(
147+
'scancode_output_path',
148+
help="scancode-toolkit output json file"
149+
)
136150
return parser.parse_args()
137151

138152

139153
if __name__ == "__main__":
154+
init_logger()
140155
args = parse_args()
141-
if args.file and os.path.isfile(args.file):
142-
count = license_check(args.directory_name, args.file)
143-
if count == 0:
144-
sys.exit(0)
145-
else:
146-
sys.exit(-1)
156+
if os.path.isfile(args.scancode_output_path):
157+
sys.exit(
158+
ReturnCode.SUCCESS.value
159+
if license_check(args.scancode_output_path) == 0
160+
else ReturnCode.ERROR.value
161+
)
147162
else:
148163
userlog.warning("Could not find the scancode json file")
149-
sys.exit(-1)
164+
sys.exit(ReturnCode.ERROR.value)

0 commit comments

Comments
 (0)