From 07dbd0a37bdb9d2bdb6259f3a94488fbc87badc1 Mon Sep 17 00:00:00 2001 From: Achraf Maghous Date: Wed, 25 Jun 2025 11:48:30 +0100 Subject: [PATCH] feat: add Dockerfile analysis for build command detection Changes: -Function find_dockerfile_from_job: handles finding Dockerfile inside workflow in 2 cases of workflow jobs: -run and -uses. -Simple DockerNode class, so far it stores mainly the dockerfile path retrieved from workflow -Parsing Dockerfile using dockerfile-parse and RUN instruction commands using bashparser.py -Parsing and storing build commands found in Dockerfiles Signed-off-by: Achraf Maghous --- pyproject.toml | 2 + .../checks/build_script_check.py | 35 +++ .../ci_service/base_ci_service.py | 18 ++ .../ci_service/github_actions/analyzer.py | 282 +++++++++++++++++- .../github_actions/github_actions_ci.py | 50 +++- 5 files changed, 374 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 74705364b..ce77300f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "problog >= 2.2.6,<3.0.0", "cryptography >=44.0.0,<45.0.0", "semgrep == 1.113.0", + "dockerfile-parse >= 2.0.1" ] keywords = [] # https://pypi.org/classifiers/ @@ -79,6 +80,7 @@ dev = [ "pylint >=3.0.3,<4.0.0", "cyclonedx-bom >=4.0.0,<5.0.0", "types-beautifulsoup4 >= 4.12.0,<5.0.0", + "types-dockerfile-parse >= 2.0.0" ] docs = [ "sphinx >=8.0.0,<9.0.0", diff --git a/src/macaron/slsa_analyzer/checks/build_script_check.py b/src/macaron/slsa_analyzer/checks/build_script_check.py index ccd61cca1..ebfcd62b3 100644 --- a/src/macaron/slsa_analyzer/checks/build_script_check.py +++ b/src/macaron/slsa_analyzer/checks/build_script_check.py @@ -107,12 +107,15 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: # we parse bash scripts that are reachable through CI only. 
result_tables: list[CheckFacts] = [] ci_services = ctx.dynamic_data["ci_services"] + for tool in build_tools: for ci_info in ci_services: ci_service: BaseCIService = ci_info["service"] # Checking if a CI service is discovered for this repo. if isinstance(ci_service, NoneCIService): continue + + # Process regular workflow build commands try: for build_command in ci_service.get_build_tool_commands( callgraph=ci_info["callgraph"], build_tool=tool @@ -148,6 +151,38 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: except CallGraphError as error: logger.debug(error) + # Process Docker build commands if the CI service has the method + if hasattr(ci_service, "get_docker_build_commands"): + try: + for build_command in ci_service.get_docker_build_commands( + callgraph=ci_info["callgraph"], build_tool=tool + ): + logger.debug("Processing Docker build command %s", build_command) + # For Dockerfile, link to the Dockerfile itself + relative_path = os.path.relpath(build_command["ci_path"], ctx.component.repository.fs_path) + trigger_link = ci_service.api_client.get_file_link( + ctx.component.repository.full_name, + ctx.component.repository.commit_sha, + relative_path, + ) + logger.debug("Trigger link for Docker build command: %s", trigger_link) + + result_tables.append( + BuildScriptFacts( + build_tool_name=tool.name, + ci_service_name=ci_service.name, + build_trigger=trigger_link, + language=build_command["language"], + language_distributions=None, + language_versions=None, + language_url=None, + build_tool_command=tool.serialize_to_json(build_command["command"]), + confidence=Confidence.HIGH, + ) + ) + except CallGraphError as error: + logger.debug(error) + return CheckResultData(result_tables=result_tables, result_type=CheckResultType.PASSED) diff --git a/src/macaron/slsa_analyzer/ci_service/base_ci_service.py b/src/macaron/slsa_analyzer/ci_service/base_ci_service.py index adaa3ce95..a6372c189 100644 --- a/src/macaron/slsa_analyzer/ci_service/base_ci_service.py 
def get_docker_build_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]:
    """Traverse the callgraph and find all the reachable Docker build commands.

    This base implementation never yields anything: it always raises, and CI services
    that can discover Dockerfile build commands are expected to override it.

    Parameters
    ----------
    callgraph: CallGraph
        The callgraph reachable from the CI workflows.
    build_tool: BaseBuildTool
        The build tool for which Docker build commands need to be detected.

    Yields
    ------
    BuildToolCommand
        The object that contains the Docker build command as well as useful contextual information.

    Raises
    ------
    CallGraphError
        Always raised by this default implementation.
    """
    # By default we assume that there is no Docker build command available for a CI service.
    # Each CI service should override this method if a Docker build command is generated for it.
    raise CallGraphError("There is no Docker build command for this CI service.")
# Substrings of `uses:` references that are treated as Docker build actions.
_DOCKER_BUILD_ACTIONS = (
    "docker/build-push-action",
    "docker/setup-buildx-action",
    "docker-build-push",
)

# Matches a `docker build` / `docker buildx build` invocation and captures its arguments up to the end
# of that single shell command (a newline not escaped by a backslash, or one of `;`, `&`, `|`).
_DOCKER_BUILD_CMD = re.compile(r"\bdocker\s+(?:buildx\s+)?build\b((?:\\\r?\n|[^\n;&|])*)")

# `docker build` options that do not take a value. Any other option given without an inline `=value`
# is assumed to consume the following token.
_DOCKER_BUILD_BOOL_FLAGS = frozenset(
    {
        "--compress",
        "--disable-content-trust",
        "--force-rm",
        "--help",
        "--load",
        "--no-cache",
        "--pull",
        "--push",
        "--quiet",
        "--rm",
        "--squash",
        "-q",
    }
)


def _resolve_in_repo(repo_path: str, path: str) -> str:
    """Return ``path`` unchanged if it is absolute, otherwise join it onto ``repo_path``."""
    return path if os.path.isabs(path) else os.path.join(repo_path, path)


def _dockerfile_from_run_command(run_cmd: str, repo_path: str) -> str | None:
    """Return the Dockerfile targeted by the first ``docker build`` command in a run script, if any."""
    build_cmd = _DOCKER_BUILD_CMD.search(run_cmd)
    if build_cmd is None:
        return None

    # Only the arguments of the docker build command itself are inspected, so that options of
    # unrelated commands in the same script (e.g. `rm -f`) are never mistaken for `--file`.
    joined_args = re.sub(r"\\\r?\n", " ", build_cmd.group(1))
    tokens = (token.strip("'\"") for token in joined_args.split())

    dockerfile: str | None = None
    context: str | None = None
    for token in tokens:
        if token in ("-f", "--file"):
            dockerfile = next(tokens, None)
        elif token.startswith(("--file=", "-f=")):
            dockerfile = token.split("=", 1)[1]
        elif token.startswith("-"):
            # Skip the value of options such as `-t image:tag` so it is not taken for the context.
            if token != "-" and "=" not in token and token not in _DOCKER_BUILD_BOOL_FLAGS:
                next(tokens, None)
        elif context is None:
            # The first positional argument is the build context.
            context = token

    if dockerfile:
        return _resolve_in_repo(repo_path, dockerfile)
    # Without an explicit file, default to 'Dockerfile' in the build context.
    return os.path.join(repo_path, context or ".", "Dockerfile")


def find_dockerfile_from_job(job_node: GitHubJobNode, repo_path: str) -> str | None:
    """Find the Dockerfile used in a GitHub Actions job.

    Steps are inspected in order and the first match wins:

    * a ``run`` step containing a ``docker build`` (or ``docker buildx build``) command: the
      ``-f``/``--file`` argument if present, otherwise ``Dockerfile`` in the build context. The
      returned path is not checked for existence.
    * a ``uses`` step referencing a known Docker build action: the ``file`` input if set, otherwise
      ``Dockerfile`` in the ``context`` input directory if that file exists, otherwise
      ``Dockerfile`` at the repository root if that file exists.

    Parameters
    ----------
    job_node: GitHubJobNode
        The target GitHub Actions job node.
    repo_path: str
        The path to the target repository.

    Returns
    -------
    str | None
        The path to the Dockerfile or None if not found.
    """
    logger.info("Finding Dockerfile in job node: %s", job_node.name)
    # Get steps directly from the job node's parsed object.
    steps = job_node.parsed_obj.obj.get("steps", [])
    if not isinstance(steps, list):
        return None

    for step in steps:
        if not isinstance(step, dict):
            continue

        # Handle 'run' steps with a docker build command.
        if not is_action_step(step) and "run" in step:
            dockerfile_path = _dockerfile_from_run_command(str(step["run"]), repo_path)
            if dockerfile_path is not None:
                logger.debug("dockerfile_path in run step: %s", dockerfile_path)
                return dockerfile_path

        # Handle 'uses' steps with docker-related actions.
        if "uses" in step and any(action in str(step["uses"]) for action in _DOCKER_BUILD_ACTIONS):
            with_section = step.get("with")
            if isinstance(with_section, dict):
                # The 'file' input is the Dockerfile path.
                if with_section.get("file"):
                    return _resolve_in_repo(repo_path, str(with_section["file"]))

                # The 'context' input may contain a Dockerfile with the default name.
                if with_section.get("context"):
                    candidate = os.path.join(repo_path, str(with_section["context"]), "Dockerfile")
                    if os.path.exists(candidate):
                        return candidate

            # If no file was specified, check for the default Dockerfile at the repository root.
            default_dockerfile = os.path.join(repo_path, "Dockerfile")
            if os.path.exists(default_dockerfile):
                logger.debug("Using default Dockerfile location")
                return default_dockerfile

    return None
def parse_run_commands(dockerfile_path: str) -> list[str]:
    """Collect the values of all RUN instructions in a Dockerfile.

    Parameters
    ----------
    dockerfile_path: str
        The path to the Dockerfile.

    Returns
    -------
    list[str]
        The value of every RUN instruction, in the order reported by the parser.

    Raises
    ------
    CallGraphError
        If the Dockerfile cannot be opened or parsed; the underlying error is chained.
    """
    try:
        with open(dockerfile_path, encoding="utf-8") as handle:
            parser = DockerfileParser(fileobj=handle)
            # Keep only RUN entries; every other instruction kind is ignored.
            return [entry["value"] for entry in parser.structure if entry["instruction"] == "RUN"]
    except Exception as error:
        raise CallGraphError(f"Error parsing Dockerfile at {dockerfile_path}: {error}") from error
- steps = job.get("steps") - if steps is None: + # Process steps + steps = job_node.parsed_obj.obj.get("steps") + if not isinstance(steps, list): continue - for step in steps: + + for step_idx, step in enumerate(steps): + # First check if this step uses Docker + dockerfile_path = None + step_id = step.get("id", f"step_{step_idx}") + + # Check for Docker usage in this specific step + if "run" in step: + run_cmd = step["run"] + if "docker build" in run_cmd: + # Extract Dockerfile path from docker build command + match = re.search(r"(?:--file|-f)\s+([^\s]+)", run_cmd) + if match: + dockerfile_path = match.group(1) + dockerfile_path = ( + os.path.join(repo_path, dockerfile_path) + if not os.path.isabs(dockerfile_path) + else dockerfile_path + ) + else: + # Default to 'Dockerfile' in the build context + context_match = re.search(r"docker build\s+([^\s]+)", run_cmd) + context_path = context_match.group(1) if context_match else "." + dockerfile_path = os.path.join(repo_path, context_path, "Dockerfile") + + elif "uses" in step: + uses_action = step["uses"] + # Check for docker-related actions + if any( + docker_action in uses_action + for docker_action in [ + "docker/build-push-action", + "docker/setup-buildx-action", + "docker-build-push", + ] + ): + if "with" in step: + with_section = step["with"] + if "file" in with_section: + dockerfile_path = ( + with_section["file"] + if isinstance(with_section, dict) and "file" in with_section + else "" + ) + dockerfile_path = ( + os.path.join(repo_path, str(dockerfile_path)) + if not os.path.isabs(str(dockerfile_path)) + else dockerfile_path + ) + + # If we found a Dockerfile process it + if dockerfile_path: + + # Create a DockerNode for this step + docker_node = DockerNode( + node_id=f"{job_name}_{step_id}_docker", + caller=job_node, + dockerfile_path=dockerfile_path, + ) + if docker_node: + job_node.add_callee(docker_node) + logger.info("Created DockerNode with id: %s", docker_node.node_id) + + # Parse RUN commands from 
Dockerfile + try: + run_cmds = parse_run_commands(dockerfile_path) + logger.info("RUN commands found in Dockerfile %s", run_cmd) + + for run_cmd in run_cmds: + try: + # Create a minimal step AST that contains the run command + docker_step_ast = RunStep(run=run_cmd) + + docker_bash_node = create_bash_node( + name="Dockerfile-RUN", + node_id=f"{job_name}_{step_id}_docker_run_", + node_type=BashScriptType.INLINE, + source_path=dockerfile_path, + ci_step_ast=docker_step_ast, + repo_path=repo_path, + caller=docker_node, + recursion_depth=0, + ) + docker_node.add_callee(docker_bash_node) + except CallGraphError as error: + logger.error("Error creating BashNode for Dockerfile RUN command %s", error) + except CallGraphError as error: + logger.error("Error parsing Dockerfile at %s: %s", dockerfile_path, error) + + # Now handle the regular step processing if is_action_step(step): - # TODO: change source_path for external workflows. + # External action that's not Docker-related (or is Docker but we already handled it) action_name = step["uses"] external_node = GitHubWorkflowNode( name=action_name, @@ -340,11 +566,41 @@ def build_call_graph_from_node(node: GitHubWorkflowNode, repo_path: str) -> None caller=job_node, recursion_depth=0, ) + job_node.add_callee(callee) + + # Check if this step uses Docker build + run_cmd = str(step.get("run", "")) + if "docker build" in run_cmd: + # Find the Dockerfile path from the docker build command + dockerfile_path = None + match = re.search(r"(?:--file|-f)\s+([^\s]+)", run_cmd) + if match: + dockerfile_path = match.group(1) + dockerfile_path = ( + os.path.join(repo_path, dockerfile_path) + if not os.path.isabs(dockerfile_path) + else dockerfile_path + ) + else: + # Default to 'Dockerfile' in the build context + context_match = re.search(r"docker build\s+([^\s]+)", run_cmd) + context_path = context_match.group(1) if context_match else "." 
def get_docker_build_commands(self, callgraph: CallGraph, build_tool: BaseBuildTool) -> Iterable[BuildToolCommand]:
    """Traverse the callgraph and yield build tool commands found under Dockerfile nodes.

    Parameters
    ----------
    callgraph: CallGraph
        The callgraph reachable from the CI workflows.
    build_tool: BaseBuildTool
        The corresponding build tool for which shell commands need to be detected.

    Yields
    ------
    BuildToolCommand
        The build command together with the Dockerfile path it was found in, the Bash node that
        contains it, and the trigger events of the enclosing workflow (empty if none is found).
    """
    for docker_node in callgraph.bfs():
        if not isinstance(docker_node, DockerNode) or not hasattr(docker_node, "dockerfile_path"):
            continue
        path_to_dockerfile = docker_node.dockerfile_path

        # Walk up the caller chain to the nearest workflow node, which provides the CI events.
        enclosing_workflow = None
        ancestor = docker_node.caller
        while ancestor:
            if isinstance(ancestor, GitHubWorkflowNode):
                enclosing_workflow = ancestor
                break
            ancestor = getattr(ancestor, "caller", None)

        # Only Bash nodes attached directly to the Dockerfile node are inspected.
        for bash_node in docker_node.callee:
            if not isinstance(bash_node, BashNode):
                continue
            for command in bash_node.parsed_bash_obj.get("commands", []):
                if not build_tool.is_build_command(command):
                    continue
                yield BuildToolCommand(
                    ci_path=path_to_dockerfile,
                    command=command,
                    step_node=bash_node,
                    language=build_tool.language,
                    language_versions=None,
                    language_distributions=None,
                    language_url=None,
                    reachable_secrets=[],
                    events=get_ci_events(enclosing_workflow) if enclosing_workflow else [],
                )