From 78a3d93ffdebd9a384b3085ccde0aaf74062067b Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Fri, 26 Apr 2024 09:56:41 -0700 Subject: [PATCH 01/33] try setting MAX_JOBS=4 for oom in arm wheel --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index e0b34c24b..1761d2276 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -201,7 +201,7 @@ def parse_arguments(): branch = "master" print("Building PyTorch wheel") - build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "MAX_JOBS=4 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " os.system("python setup.py clean") override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") From 160daf376219476acede59aecdd5a719102de2fa Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Sun, 5 May 2024 00:55:03 -0700 Subject: [PATCH 02/33] change to desired_cuda --- aarch64_linux/aarch64_ci_build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aarch64_linux/aarch64_ci_build.sh b/aarch64_linux/aarch64_ci_build.sh index 6d9a2f6b0..64632ff13 100644 --- a/aarch64_linux/aarch64_ci_build.sh +++ b/aarch64_linux/aarch64_ci_build.sh @@ -26,8 +26,8 @@ cd / git config --global --add safe.directory /pytorch pip install -r /pytorch/requirements.txt pip install auditwheel -if [ -n "$GPU_ARCH_VERSION" ]; then - echo "BASE_CUDA_VERSION is set to: $GPU_ARCH_VERSION" +if [ -n "$DESIRED_CUDA" ]; then + echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" python /builder/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda else echo "BASE_CUDA_VERSION is not set." From 78dd24d3faab9e154807b372e115cbb40984743d Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Sun, 5 May 2024 07:26:15 -0700 Subject: [PATCH 03/33] change desired_cuda check --- aarch64_linux/aarch64_ci_build.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aarch64_linux/aarch64_ci_build.sh b/aarch64_linux/aarch64_ci_build.sh index 64632ff13..5451df2b6 100644 --- a/aarch64_linux/aarch64_ci_build.sh +++ b/aarch64_linux/aarch64_ci_build.sh @@ -26,10 +26,10 @@ cd / git config --global --add safe.directory /pytorch pip install -r /pytorch/requirements.txt pip install auditwheel -if [ -n "$DESIRED_CUDA" ]; then +if [ "$DESIRED_CUDA" = "cpu" ]; then + echo "BASE_CUDA_VERSION is not set. Building cpu wheel." + python /builder/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn +else echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" python /builder/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda -else - echo "BASE_CUDA_VERSION is not set." - python /builder/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn fi \ No newline at end of file From 6eed27215a8a48afd5b006ee38da7067edfb603a Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Mon, 6 May 2024 13:19:07 -0700 Subject: [PATCH 04/33] change path --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 1761d2276..b34c5cd25 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -202,7 +202,7 @@ def parse_arguments(): print("Building PyTorch wheel") build_vars = "MAX_JOBS=4 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " - os.system("python setup.py clean") + os.system("cd /pytorch; python setup.py clean") override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") if override_package_version is not None: From fa2a485af03cd86e81bd8102f3fb6ae537916537 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Tue, 7 May 2024 07:25:37 -0700 Subject: [PATCH 05/33] remove libopenblas file --- aarch64_linux/aarch64_wheel_ci_build.py | 1 - 1 file changed, 1 deletion(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index b34c5cd25..aa6aaabc4 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -122,7 +122,6 @@ def update_wheel(wheel_path) -> None: "/usr/local/cuda/lib64/libcudnn_cnn_train.so.8", "/usr/local/cuda/lib64/libcudnn_ops_infer.so.8", "/usr/local/cuda/lib64/libcudnn_ops_train.so.8", - "/opt/conda/envs/aarch64_env/lib/libopenblas.so.0", "/opt/conda/envs/aarch64_env/lib/libgfortran.so.5", "/opt/conda/envs/aarch64_env/lib/libgomp.so.1", "/acl/build/libarm_compute.so", From 19feff41ee5e58a62f4f09b59bb219341adb771a Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Tue, 7 May 2024 16:44:52 -0700 Subject: [PATCH 06/33] test only hopper for quicker tat --- aarch64_linux/aarch64_wheel_ci_build.py | 3 +-- manywheel/build_cuda.sh | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index aa6aaabc4..0c653184c 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -122,7 +122,6 @@ def update_wheel(wheel_path) -> None: "/usr/local/cuda/lib64/libcudnn_cnn_train.so.8", "/usr/local/cuda/lib64/libcudnn_ops_infer.so.8", "/usr/local/cuda/lib64/libcudnn_ops_train.so.8", - "/opt/conda/envs/aarch64_env/lib/libgfortran.so.5", "/opt/conda/envs/aarch64_env/lib/libgomp.so.1", "/acl/build/libarm_compute.so", "/acl/build/libarm_compute_graph.so", @@ -200,7 +199,7 @@ def parse_arguments(): branch = "master" print("Building PyTorch wheel") - build_vars = "MAX_JOBS=4 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " os.system("cd /pytorch; python setup.py clean") override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index 4fc1ed278..f10aa3bc3 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -60,7 +60,7 @@ cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.') TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6" case ${CUDA_VERSION} in 12.4) - TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0" + TORCH_CUDA_ARCH_LIST="9.0" EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON") ;; 12.1) From 3931c11781518a3340d033daf225f27b82d062eb Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Wed, 8 May 2024 19:17:41 -0700 Subject: [PATCH 07/33] add back max_jobs=4 --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 0c653184c..8d0882098 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -199,7 +199,7 @@ def parse_arguments(): branch = "master" print("Building PyTorch wheel") - build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "MAX_JOBS=4 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " os.system("cd /pytorch; python setup.py clean") override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") From f2f8250ef682b0efd4d8d7dc55131d9aad1c9477 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Wed, 8 May 2024 20:46:22 -0700 Subject: [PATCH 08/33] cherrypick #1808 --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- manywheel/build_cuda.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 8d0882098..0c653184c 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -199,7 +199,7 @@ def parse_arguments(): branch = "master" print("Building PyTorch wheel") - build_vars = "MAX_JOBS=4 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " os.system("cd /pytorch; python setup.py clean") override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index f10aa3bc3..2837b3793 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -155,7 +155,7 @@ if [[ $CUDA_VERSION == "12.1" || $CUDA_VERSION == "12.4" ]]; then "/usr/local/cuda/lib64/libcudart.so.12" "/usr/local/cuda/lib64/libnvToolsExt.so.1" "/usr/local/cuda/lib64/libnvrtc.so.12" - "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.1" + "/usr/local/cuda/lib64/libnvrtc-builtins.so" ) DEPS_SONAME+=( "libcudnn_adv_infer.so.8" @@ -170,7 +170,7 @@ if [[ $CUDA_VERSION == "12.1" || $CUDA_VERSION == "12.4" ]]; then "libcudart.so.12" "libnvToolsExt.so.1" "libnvrtc.so.12" - "libnvrtc-builtins.so.12.1" + "libnvrtc-builtins.so" ) else echo "Using nvidia libs from pypi." From 71bc4f283cae3f13659cde8955469f7b2c49ee40 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Wed, 8 May 2024 21:30:42 -0700 Subject: [PATCH 09/33] need maxjobs=4 --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 0c653184c..8d0882098 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -199,7 +199,7 @@ def parse_arguments(): branch = "master" print("Building PyTorch wheel") - build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "MAX_JOBS=4 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " os.system("cd /pytorch; python setup.py clean") override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") From 5dcd9dd1dfb8bde8b6b8caf01126fd3b7eed0684 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Thu, 9 May 2024 08:16:42 -0700 Subject: [PATCH 10/33] fix path to copy wheel --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 8d0882098..2a9573883 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -138,7 +138,7 @@ def update_wheel(wheel_path) -> None: os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *") shutil.move( f"{folder}/cuda_wheel/{wheelname}", - f"/dist/{wheelname}", + f"{folder}/{wheelname}", copy_function=shutil.copy2, ) os.system(f"rm -rf {folder}/tmp {folder}/dist/cuda_wheel/") From 3c9ff9813d903d4c76ff06894a0dd46c9a3a32c4 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Thu, 9 May 2024 08:18:16 -0700 Subject: [PATCH 11/33] fix path to rm --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 2a9573883..38fd58314 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -141,7 +141,7 @@ def update_wheel(wheel_path) -> None: f"{folder}/{wheelname}", copy_function=shutil.copy2, ) - os.system(f"rm -rf {folder}/tmp {folder}/dist/cuda_wheel/") + os.system(f"rm -rf {folder}/tmp/ {folder}/cuda_wheel/") def complete_wheel(folder: str) -> str: From 42ea49397617685fb026956cd37cfc00dff4c89f Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Thu, 9 May 2024 08:23:00 -0700 Subject: [PATCH 12/33] try set max jobs to 5 as 4 is too slow --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 38fd58314..876fbfe76 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -199,7 +199,7 @@ def parse_arguments(): branch = "master" print("Building PyTorch wheel") - build_vars = "MAX_JOBS=4 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " os.system("cd /pytorch; python setup.py clean") override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") From f670d5bfcf3f86ea7b5af179bdf5abfe33f0acf0 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Thu, 16 May 2024 08:48:30 -0700 Subject: [PATCH 13/33] cuda 9.0 for aarch64 only --- manywheel/build_cuda.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index 2837b3793..6dc2e0f90 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -60,7 +60,11 @@ cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.') TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6" case ${CUDA_VERSION} in 12.4) - TORCH_CUDA_ARCH_LIST="9.0" + if [[ "$GPU_ARCH_TYPE" = "cuda-aarch64" ]]; then + TORCH_CUDA_ARCH_LIST="9.0" + else + TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0" + fi EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON") ;; 12.1) From 0eecef2c979dd9406dd76a770b722d5967101a72 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Fri, 17 May 2024 04:58:29 -0700 Subject: [PATCH 14/33] add libopenblas.so new location (from OpenBLAS) --- aarch64_linux/aarch64_wheel_ci_build.py | 1 + 1 file changed, 1 insertion(+) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 876fbfe76..2127e5d0c 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -123,6 +123,7 @@ def update_wheel(wheel_path) -> None: "/usr/local/cuda/lib64/libcudnn_ops_infer.so.8", "/usr/local/cuda/lib64/libcudnn_ops_train.so.8", "/opt/conda/envs/aarch64_env/lib/libgomp.so.1", + "/opt/OpenBLAS/lib/libopenblas.so.0", "/acl/build/libarm_compute.so", "/acl/build/libarm_compute_graph.so", "/acl/build/libarm_compute_core.so", From 3841eaf931a2701d2b77768431e3519aa7805a92 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Fri, 17 May 2024 07:09:49 -0700 Subject: [PATCH 15/33] upgrade ACL version to 24.04 (1824) --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- aarch64_linux/build_aarch64_wheel.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 2127e5d0c..9a9ed88e3 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -78,7 +78,7 @@ def build_ArmComputeLibrary() -> None: "clone", "https://github.com/ARM-software/ComputeLibrary.git", "-b", - "v23.08", + "v24.04", "--depth", "1", "--shallow-submodules", diff --git a/aarch64_linux/build_aarch64_wheel.py b/aarch64_linux/build_aarch64_wheel.py index 0ff286ad2..3956f0463 100755 --- a/aarch64_linux/build_aarch64_wheel.py +++ b/aarch64_linux/build_aarch64_wheel.py @@ -229,7 +229,7 @@ def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None print('Building Arm Compute Library') acl_build_flags=" ".join(["debug=0", "neon=1", "opencl=0", "os=linux", "openmp=1", "cppthreads=0", "arch=armv8a", "multi_isa=1", "fixed_format_kernels=1", "build=native"]) - host.run_cmd(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.08 {git_clone_flags}") + host.run_cmd(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v24.04 {git_clone_flags}") host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}") From 9f62e48810cae2a1f6a678054663fa716b1de552 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Fri, 17 May 2024 13:05:17 -0700 Subject: [PATCH 16/33] remove copy libarm_compute_core.so --- aarch64_linux/aarch64_wheel_ci_build.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 9a9ed88e3..1a10c8efc 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -126,7 +126,6 @@ def update_wheel(wheel_path) -> None: "/opt/OpenBLAS/lib/libopenblas.so.0", "/acl/build/libarm_compute.so", "/acl/build/libarm_compute_graph.so", - "/acl/build/libarm_compute_core.so", ] # Copy libraries to unzipped_folder/a/lib for lib_path in libs_to_copy: @@ -200,7 +199,7 @@ def parse_arguments(): branch = "master" print("Building PyTorch wheel") - build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " os.system("cd /pytorch; python setup.py clean") override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") From 374e9e18fcc7a8cc82128ec593862af3d71456d0 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Fri, 17 May 2024 19:20:10 -0700 Subject: [PATCH 17/33] still need max_jobs=5 as 6 oom --- aarch64_linux/README.md | 2 +- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aarch64_linux/README.md b/aarch64_linux/README.md index 583ed4af9..4a3be5312 100644 --- a/aarch64_linux/README.md +++ b/aarch64_linux/README.md @@ -16,4 +16,4 @@ __NOTE:__ CI build is currently __EXPERMINTAL__ This app allows a person to build using AWS EC3 resources and requires AWS-CLI and Boto3 with AWS credentials to support building EC2 instances for the wheel builds. Can be used in a codebuild CD or from a local system. ### Usage -```build_aarch64_wheel.py --key-name --use-docker --python 3.8 --branch ``` +```build_aarch64_wheel.py --key-name --use-docker --python 3.8 --branch ``` \ No newline at end of file diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 1a10c8efc..5297cf42d 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -199,7 +199,7 @@ def parse_arguments(): branch = "master" print("Building PyTorch wheel") - build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " os.system("cd /pytorch; python setup.py clean") override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") From c49a7570dac77553563243d9fab0ff08a2200de9 Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Thu, 2 May 2024 12:11:00 -0500 Subject: [PATCH 18/33] aarch64: cd: fix issue with invoking cpu wheel build option (#1791) --- aarch64_linux/aarch64_ci_build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aarch64_linux/aarch64_ci_build.sh b/aarch64_linux/aarch64_ci_build.sh index 5451df2b6..dc9f25a1f 100644 --- a/aarch64_linux/aarch64_ci_build.sh +++ b/aarch64_linux/aarch64_ci_build.sh @@ -1,6 +1,8 @@ #!/bin/bash set -eux -o pipefail +GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} + SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" source $SCRIPTPATH/aarch64_ci_setup.sh From d31681eceb335c65022e19ac6e3db0cad0c93233 Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov <103434461+AlekseiNikiforovIBM@users.noreply.github.com> Date: Fri, 3 May 2024 22:36:04 +0200 Subject: [PATCH 19/33] Update s390x builder (#1802) * Disable automatic building of s390x docker image * Update docker image and build scripts for s390x * Switch devtoolset to 13 There is a not yet investigated build failure caused by gcc 12, but it doesn't reproduce with gcc 13. * Adapt binaries check for s390x * Switch to ubuntu:24.04 for s390x * Update libgomp.so.1 path for s390x --- .github/workflows/build-manywheel-images.yml | 15 ----- check_binary.sh | 6 +- manywheel/Dockerfile_s390x | 62 +++++++++++--------- manywheel/build.sh | 2 +- manywheel/build_cpu.sh | 6 +- manywheel/build_scripts/build.sh | 55 +++++++++++------ manywheel/build_scripts/manylinux1-check.py | 2 +- 7 files changed, 82 insertions(+), 66 deletions(-) diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index cf7d82828..a599635f8 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -137,18 +137,3 @@ jobs: - name: Build Docker Image run: | manywheel/build_docker.sh - build-docker-cpu-s390x: - runs-on: linux.s390x - env: - GPU_ARCH_TYPE: cpu-s390x - steps: - - name: Checkout PyTorch - uses: actions/checkout@v3 - - name: Authenticate if WITH_PUSH - run: | - if [[ "${WITH_PUSH}" == true ]]; then - echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin - fi - - name: Build Docker Image - run: | - manywheel/build_docker.sh diff --git a/check_binary.sh b/check_binary.sh index 98a5267eb..be2b5252b 100755 --- a/check_binary.sh +++ b/check_binary.sh @@ -330,7 +330,7 @@ fi if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then echo "Checking that MKL is available" build_and_run_example_cpp check-torch-mkl -elif [[ "$(uname -m)" != "arm64" ]]; then +elif [[ "$(uname -m)" != "arm64" && "$(uname -m)" != "s390x" ]]; then if [[ "$(uname)" != 'Darwin' || "$PACKAGE_TYPE" != *wheel ]]; then if [[ "$(uname -m)" == "aarch64" ]]; then echo "Checking that MKLDNN is available on aarch64" @@ -354,7 +354,7 @@ if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then echo "Checking that XNNPACK is available" build_and_run_example_cpp check-torch-xnnpack else - if [[ "$(uname)" != 'Darwin' || "$PACKAGE_TYPE" != *wheel ]]; then + if [[ "$(uname)" != 'Darwin' || "$PACKAGE_TYPE" != *wheel ]] && [[ "$(uname -m)" != "s390x" ]]; then echo "Checking that XNNPACK is available" pushd /tmp python -c 'import torch.backends.xnnpack; exit(0 if torch.backends.xnnpack.enabled else 1)' @@ -375,7 +375,7 @@ if [[ "$OSTYPE" == "msys" ]]; then fi # Test that CUDA builds are setup correctly -if [[ "$DESIRED_CUDA" != 'cpu' && "$DESIRED_CUDA" != 'cpu-cxx11-abi' && "$DESIRED_CUDA" != *"rocm"* ]]; then +if [[ "$DESIRED_CUDA" != 'cpu' && "$DESIRED_CUDA" != 'cpu-cxx11-abi' && "$DESIRED_CUDA" != *"rocm"* && "$(uname -m)" != "s390x" ]]; then if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then build_and_run_example_cpp check-torch-cuda else diff --git a/manywheel/Dockerfile_s390x b/manywheel/Dockerfile_s390x index e30d0bea5..caa5d00bd 100644 --- a/manywheel/Dockerfile_s390x +++ b/manywheel/Dockerfile_s390x @@ -1,18 +1,15 @@ -FROM --platform=linux/s390x docker.io/redhat/ubi9 as base +FROM --platform=linux/s390x docker.io/ubuntu:24.04 as base -# earliest available version in ubi9 -ARG DEVTOOLSET_VERSION=12 - -# Language variabes -ENV LC_ALL=en_US.UTF-8 -ENV LANG=en_US.UTF-8 -ENV LANGUAGE=en_US.UTF-8 +# Language variables +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 +ENV LANGUAGE=C.UTF-8 # Installed needed OS packages. This is to support all # the binary builds (torch, vision, audio, text, data) -RUN dnf -y install redhat-release -RUN dnf -y update -RUN dnf install -y --allowerasing \ +RUN apt update ; apt upgrade -y +RUN apt install -y \ + build-essential \ autoconf \ automake \ bzip2 \ @@ -27,20 +24,19 @@ RUN dnf install -y --allowerasing \ util-linux \ wget \ which \ - xz \ + xz-utils \ less \ zstd \ - libgomp \ cmake \ - gcc-toolset-${DEVTOOLSET_VERSION}-gcc \ - gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \ - gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \ - gcc-toolset-${DEVTOOLSET_VERSION}-binutils - -# Ensure the expected gcc-toolset is used -ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH -ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH - + python3 \ + python3-dev \ + python3-setuptools \ + python3-yaml \ + python3-typing-extensions \ + libblas-dev \ + libopenblas-dev \ + liblapack-dev \ + libatlas-base-dev # git236+ would refuse to run git commands in repos owned by other users # Which causes version check to fail, as pytorch repo is bind-mounted into the image @@ -57,9 +53,21 @@ ADD ./common/install_openssl.sh install_openssl.sh RUN bash ./install_openssl.sh && rm install_openssl.sh ENV SSL_CERT_FILE=/opt/_internal/certs.pem +# EPEL for cmake +FROM base as patchelf +# Install patchelf +ADD ./common/install_patchelf.sh install_patchelf.sh +RUN bash ./install_patchelf.sh && rm install_patchelf.sh +RUN cp $(which patchelf) /patchelf + +FROM patchelf as python +# build python +COPY manywheel/build_scripts /build_scripts +ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh +RUN bash build_scripts/build.sh && rm -r build_scripts + FROM openssl as final -# remove unncessary python versions -RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 -RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 -RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 -RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 +COPY --from=python /opt/python /opt/python +COPY --from=python /opt/_internal /opt/_internal +COPY --from=python /opt/python/cp37-cp37m/bin/auditwheel /usr/local/bin/auditwheel +COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf diff --git a/manywheel/build.sh b/manywheel/build.sh index 43725615d..a04d05869 100755 --- a/manywheel/build.sh +++ b/manywheel/build.sh @@ -15,7 +15,7 @@ case "${GPU_ARCH_TYPE:-BLANK}" in rocm) bash "${SCRIPTPATH}/build_rocm.sh" ;; - cpu | cpu-cxx11-abi) + cpu | cpu-cxx11-abi | cpu-s390x) bash "${SCRIPTPATH}/build_cpu.sh" ;; *) diff --git a/manywheel/build_cpu.sh b/manywheel/build_cpu.sh index 4669c8a3c..24c95f14e 100755 --- a/manywheel/build_cpu.sh +++ b/manywheel/build_cpu.sh @@ -32,7 +32,11 @@ if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then - LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1" + if [[ "$(uname -m)" == "s390x" ]]; then + LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1" + else + LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1" + fi fi DEPS_LIST=( diff --git a/manywheel/build_scripts/build.sh b/manywheel/build_scripts/build.sh index d139abcb6..c545ca967 100644 --- a/manywheel/build_scripts/build.sh +++ b/manywheel/build_scripts/build.sh @@ -15,22 +15,37 @@ CURL_HASH=cf34fe0b07b800f1c01a499a6e8b2af548f6d0e044dca4a29d88a4bee146d131 AUTOCONF_ROOT=autoconf-2.69 AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 -# Dependencies for compiling Python that we want to remove from -# the final image after compiling Python -PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel" - -# Libraries that are allowed as part of the manylinux1 profile -MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel" - # Get build utilities MY_DIR=$(dirname "${BASH_SOURCE[0]}") source $MY_DIR/build_utils.sh -# Development tools and libraries -yum -y install bzip2 make git patch unzip bison yasm diffutils \ - automake which file cmake28 \ - kernel-devel-`uname -r` \ - ${PYTHON_COMPILE_DEPS} +if [ "$(uname -m)" != "s390x" ] ; then + # Dependencies for compiling Python that we want to remove from + # the final image after compiling Python + PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel" + + # Libraries that are allowed as part of the manylinux1 profile + MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel" + + # Development tools and libraries + yum -y install bzip2 make git patch unzip bison yasm diffutils \ + automake which file cmake28 \ + kernel-devel-`uname -r` \ + ${PYTHON_COMPILE_DEPS} +else + # Dependencies for compiling Python that we want to remove from + # the final image after compiling Python + PYTHON_COMPILE_DEPS="zlib1g-dev libbz2-dev libncurses-dev libsqlite3-dev libdb-dev libpcap-dev liblzma-dev libffi-dev" + + # Libraries that are allowed as part of the manylinux1 profile + MANYLINUX1_DEPS="libglib2.0-dev libX11-dev libncurses-dev" + + # Development tools and libraries + apt install -y bzip2 make git patch unzip diffutils \ + automake which file cmake \ + linux-headers-virtual \ + ${PYTHON_COMPILE_DEPS} +fi # Install newest autoconf build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH @@ -76,12 +91,16 @@ ln -s $PY37_BIN/auditwheel /usr/local/bin/auditwheel # Clean up development headers and other unnecessary stuff for # final image -yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \ - avahi freetype bitstream-vera-fonts \ - ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1 -yum -y install ${MANYLINUX1_DEPS} -yum -y clean all > /dev/null 2>&1 -yum list installed +if [ "$(uname -m)" != "s390x" ] ; then + yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \ + avahi freetype bitstream-vera-fonts \ + ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1 + yum -y install ${MANYLINUX1_DEPS} + yum -y clean all > /dev/null 2>&1 + yum list installed +else + apt purge -y ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1 +fi # we don't need libpython*.a, and they're many megabytes find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f # Strip what we can -- and ignore errors, because this just attempts to strip diff --git a/manywheel/build_scripts/manylinux1-check.py b/manywheel/build_scripts/manylinux1-check.py index fa77ef43a..7cb62e0c0 100644 --- a/manywheel/build_scripts/manylinux1-check.py +++ b/manywheel/build_scripts/manylinux1-check.py @@ -3,7 +3,7 @@ def is_manylinux1_compatible(): # Only Linux, and only x86-64 / i686 from distutils.util import get_platform - if get_platform() not in ["linux-x86_64", "linux-i686"]: + if get_platform() not in ["linux-x86_64", "linux-i686", "linux-s390x"]: return False # Check for presence of _manylinux module From 4fcabbe91513ed6243ed18d0c74c0a96879793e1 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 6 May 2024 12:26:23 -0400 Subject: [PATCH 20/33] Fix cuda windows validations update cuda driver. (#1810) --- .github/workflows/validate-windows-binaries.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/validate-windows-binaries.yml b/.github/workflows/validate-windows-binaries.yml index fed7b1939..cef51b77f 100644 --- a/.github/workflows/validate-windows-binaries.yml +++ b/.github/workflows/validate-windows-binaries.yml @@ -127,7 +127,7 @@ jobs: printf '%s\n' ${{ toJson(inputs.release-matrix) }} > release_matrix.json source /c/Jenkins/Miniconda3/etc/profile.d/conda.sh - if [[ ${MATRIX_GPU_ARCH_VERSION} == "12.1" ]]; then + if [[ ${MATRIX_GPU_ARCH_TYPE} == "cuda" ]]; then ./windows/internal/driver_update.bat fi source ./.github/scripts/validate_binaries.sh From a8f71c070ea11da16493c88efff7bf725877564a Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 7 May 2024 11:44:59 -0400 Subject: [PATCH 21/33] Revert "aarch64: upgrade ACL version to 24.04" (#1813) This reverts commit 6b90c090ebb01d86f65493d1d609d7fadc0feab8. --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- aarch64_linux/build_aarch64_wheel.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 5297cf42d..2444429cb 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -78,7 +78,7 @@ def build_ArmComputeLibrary() -> None: "clone", "https://github.com/ARM-software/ComputeLibrary.git", "-b", - "v24.04", + "v23.08", "--depth", "1", "--shallow-submodules", diff --git a/aarch64_linux/build_aarch64_wheel.py b/aarch64_linux/build_aarch64_wheel.py index 3956f0463..0ff286ad2 100755 --- a/aarch64_linux/build_aarch64_wheel.py +++ b/aarch64_linux/build_aarch64_wheel.py @@ -229,7 +229,7 @@ def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None print('Building Arm Compute Library') acl_build_flags=" ".join(["debug=0", "neon=1", "opencl=0", "os=linux", "openmp=1", "cppthreads=0", "arch=armv8a", "multi_isa=1", "fixed_format_kernels=1", "build=native"]) - host.run_cmd(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v24.04 {git_clone_flags}") + host.run_cmd(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.08 {git_clone_flags}") host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}") From 57425f477871e0ec2425526ee930fc14981e73ae Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 7 May 2024 17:09:37 -0400 Subject: [PATCH 22/33] Don't deactivate/remove conda on linux after validation (#1814) * Don't deactivate/remove conda on linux * test --- .github/scripts/validate_binaries.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 8779e8064..bf5c15690 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -54,6 +54,11 @@ else ${PWD}/check_binary.sh fi + # We are only interested in CUDA tests and Python 3.8-3.11. Not all requirement libraries are available for 3.12 yet. + if [[ ${INCLUDE_TEST_OPS:-} == 'true' && ${MATRIX_GPU_ARCH_TYPE} == 'cuda' && ${MATRIX_PYTHON_VERSION} != "3.12" ]]; then + source ./.github/scripts/validate_test_ops.sh + fi + if [[ ${TARGET_OS} == 'windows' ]]; then python ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} else @@ -64,13 +69,8 @@ else export PATH=${OLD_PATH} fi - # We are only interested in CUDA tests and Python 3.8-3.11. Not all requirement libraries are available for 3.12 yet. - if [[ ${INCLUDE_TEST_OPS:-} == 'true' && ${MATRIX_GPU_ARCH_TYPE} == 'cuda' && ${MATRIX_PYTHON_VERSION} != "3.12" ]]; then - source ./.github/scripts/validate_test_ops.sh - fi - - # TODO: remove if statement currently this step is timing out on linx-aarch64 - if [[ ${TARGET_OS} != 'linux-aarch64' ]]; then + # this is optional step + if [[ ${TARGET_OS} != linux* ]]; then conda deactivate conda env remove -n ${ENV_NAME} fi From 8d58e64375770cc53418711bab926175256c7f71 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 10 May 2024 12:50:47 -0400 Subject: [PATCH 23/33] Add manylinux_2_28 image (#1816) * Add manylinux_2_28 image --- .github/workflows/build-manywheel-images.yml | 15 ++ manywheel/Dockerfile_2_28 | 143 +++++++++++++++++++ manywheel/build_docker.sh | 8 ++ 3 files changed, 166 insertions(+) create mode 100644 manywheel/Dockerfile_2_28 diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index a599635f8..f308edcac 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -107,6 +107,21 @@ jobs: - name: Build Docker Image run: | manywheel/build_docker.sh + build-docker-cpu-manylinux_2_28: + runs-on: ubuntu-22.04 + env: + GPU_ARCH_TYPE: cpu-manylinux_2_28 + steps: + - name: Checkout PyTorch + uses: actions/checkout@v3 + - name: Authenticate if WITH_PUSH + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + run: | + manywheel/build_docker.sh build-docker-cpu-aarch64: runs-on: linux.arm64.2xlarge env: diff --git a/manywheel/Dockerfile_2_28 b/manywheel/Dockerfile_2_28 new file mode 100644 index 000000000..6566f115d --- /dev/null +++ b/manywheel/Dockerfile_2_28 @@ -0,0 +1,143 @@ +# syntax = docker/dockerfile:experimental +ARG ROCM_VERSION=3.7 +ARG BASE_CUDA_VERSION=11.8 +ARG GPU_IMAGE=amd64/almalinux:8 +FROM quay.io/pypa/manylinux_2_28_x86_64 as base + +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 + +ARG DEVTOOLSET_VERSION=11 +RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-toolchain +ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH + +# cmake-3.18.4 from pip +RUN yum install -y python3-pip && \ + python3 -mpip install cmake==3.18.4 && \ + ln -s /usr/local/bin/cmake /usr/bin/cmake + +FROM base as openssl +# Install openssl (this must precede `build python` step) +# (In order to have a proper SSL module, Python is compiled +# against a recent openssl [see env vars above], which is linked +# statically. We delete openssl afterwards.) +ADD ./common/install_openssl.sh install_openssl.sh +RUN bash ./install_openssl.sh && rm install_openssl.sh + + +# remove unncessary python versions +RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 +RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 +RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 +RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 + +FROM base as cuda +ARG BASE_CUDA_VERSION=11.8 +# Install CUDA +ADD ./common/install_cuda.sh install_cuda.sh +RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh + +FROM base as intel +# MKL +ADD ./common/install_mkl.sh install_mkl.sh +RUN bash ./install_mkl.sh && rm install_mkl.sh + +FROM base as magma +ARG BASE_CUDA_VERSION=10.2 +# Install magma +ADD ./common/install_magma.sh install_magma.sh +RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh + +FROM base as jni +# Install java jni header +ADD ./common/install_jni.sh install_jni.sh +ADD ./java/jni.h jni.h +RUN bash ./install_jni.sh && rm install_jni.sh + +FROM base as libpng +# Install libpng +ADD ./common/install_libpng.sh install_libpng.sh +RUN bash ./install_libpng.sh && rm install_libpng.sh + +FROM ${GPU_IMAGE} as common +ARG DEVTOOLSET_VERSION=11 +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +RUN yum -y install epel-release +RUN yum -y update +RUN yum install -y \ + autoconf \ + automake \ + bison \ + bzip2 \ + curl \ + diffutils \ + file \ + git \ + make \ + patch \ + perl \ + unzip \ + util-linux \ + wget \ + which \ + xz \ + gcc-toolset-${DEVTOOLSET_VERSION}-toolchain + +RUN yum install -y \ + https://repo.ius.io/ius-release-el7.rpm \ + https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm +RUN yum swap -y git git236-core +# git236+ would refuse to run git commands in repos owned by other users +# Which causes version check to fail, as pytorch repo is bind-mounted into the image +# Override this behaviour by treating every folder as safe +# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327 +RUN git config --global --add safe.directory "*" + +ENV SSL_CERT_FILE=/opt/_internal/certs.pem +# Install LLVM version +COPY --from=openssl /opt/openssl /opt/openssl +COPY --from=base /opt/python /opt/python +COPY --from=base /opt/_internal /opt/_internal +COPY --from=base /usr/local/bin/auditwheel /usr/local/bin/auditwheel +COPY --from=intel /opt/intel /opt/intel +COPY --from=base /usr/local/bin/patchelf /usr/local/bin/patchelf +COPY --from=libpng /usr/local/bin/png* /usr/local/bin/ +COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/ +COPY --from=libpng /usr/local/include/png* /usr/local/include/ +COPY --from=libpng /usr/local/include/libpng* /usr/local/include/ +COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/ +COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig +COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h + +FROM common as cpu_final +ARG BASE_CUDA_VERSION=11.8 +ARG DEVTOOLSET_VERSION=11 +# Ensure the expected devtoolset is used +ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH +# cmake +RUN yum install -y cmake3 && \ + ln -s /usr/bin/cmake3 /usr/bin/cmake + + +FROM cpu_final as cuda_final +RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} +COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} + +FROM common as rocm_final +ARG ROCM_VERSION=3.7 +# Install ROCm +ADD ./common/install_rocm.sh install_rocm.sh +RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh +# cmake is already installed inside the rocm base image, but both 2 and 3 exist +# cmake3 is needed for the later MIOpen custom build, so that step is last. +RUN yum install -y cmake3 && \ + rm -f /usr/bin/cmake && \ + ln -s /usr/bin/cmake3 /usr/bin/cmake +ADD ./common/install_miopen.sh install_miopen.sh +RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh diff --git a/manywheel/build_docker.sh b/manywheel/build_docker.sh index 4d3816588..05f3dad81 100755 --- a/manywheel/build_docker.sh +++ b/manywheel/build_docker.sh @@ -20,6 +20,14 @@ case ${GPU_ARCH_TYPE} in GPU_IMAGE=centos:7 DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" ;; + cpu-manylinux_2_28) + TARGET=cpu_final + DOCKER_TAG=cpu + LEGACY_DOCKER_IMAGE=${DOCKER_REGISTRY}/pytorch/manylinux_2_28-cpu + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11" + MANY_LINUX_VERSION="2_28" + ;; cpu-aarch64) TARGET=final DOCKER_TAG=cpu-aarch64 From 5625515ada66cacb0218b6db2bce0d4253fbaa62 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 10 May 2024 14:17:32 -0400 Subject: [PATCH 24/33] Add manylinux_2_28 image - fix cmake (#1817) * Manylinux 2_28 fix cmake install * fix --- .github/workflows/build-manywheel-images.yml | 2 ++ manywheel/Dockerfile_2_28 | 9 +++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index f308edcac..33c0a12be 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -12,6 +12,7 @@ on: paths: - .github/workflows/build-manywheel-images.yml - manywheel/Dockerfile + - manywheel/Dockerfile_2_28 - manywheel/Dockerfile_aarch64 - manywheel/Dockerfile_cuda_aarch64 - manywheel/Dockerfile_cxx11-abi @@ -21,6 +22,7 @@ on: paths: - .github/workflows/build-manywheel-images.yml - manywheel/Dockerfile + - manywheel/Dockerfile_2_28 - manywheel/Dockerfile_aarch64 - manywheel/Dockerfile_cuda_aarch64 - manywheel/Dockerfile_cxx11-abi diff --git a/manywheel/Dockerfile_2_28 b/manywheel/Dockerfile_2_28 index 6566f115d..f5f21bf3d 100644 --- a/manywheel/Dockerfile_2_28 +++ b/manywheel/Dockerfile_2_28 @@ -16,7 +16,7 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op # cmake-3.18.4 from pip RUN yum install -y python3-pip && \ python3 -mpip install cmake==3.18.4 && \ - ln -s /usr/local/bin/cmake /usr/bin/cmake + ln -s /usr/local/bin/cmake /usr/bin/cmake3 FROM base as openssl # Install openssl (this must precede `build python` step) @@ -119,10 +119,11 @@ ARG DEVTOOLSET_VERSION=11 # Ensure the expected devtoolset is used ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH -# cmake -RUN yum install -y cmake3 && \ - ln -s /usr/bin/cmake3 /usr/bin/cmake +# cmake-3.18.4 from pip +RUN yum install -y python3-pip && \ + python3 -mpip install cmake==3.18.4 && \ + ln -s /usr/local/bin/cmake /usr/bin/cmake3 FROM cpu_final as cuda_final RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} From b47978fa08dff62235ebf64c0a99a499daf32c3c Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 13 May 2024 11:00:51 -0400 Subject: [PATCH 25/33] Add Almalinux to manywheel build script (#1818) --- manywheel/build_common.sh | 2 ++ manywheel/build_cpu.sh | 2 ++ manywheel/build_cuda.sh | 2 ++ manywheel/build_libtorch.sh | 6 ++++-- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/manywheel/build_common.sh b/manywheel/build_common.sh index 2a6f37ec4..d68d9a323 100644 --- a/manywheel/build_common.sh +++ b/manywheel/build_common.sh @@ -25,6 +25,8 @@ retry () { OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release) if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then retry yum install -q -y zip openssl +elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then + retry yum install -q -y zip openssl elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then retry dnf install -q -y zip openssl elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then diff --git a/manywheel/build_cpu.sh b/manywheel/build_cpu.sh index 24c95f14e..9d982bd30 100755 --- a/manywheel/build_cpu.sh +++ b/manywheel/build_cpu.sh @@ -31,6 +31,8 @@ if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" +elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then + LIBGOMP_PATH="/usr/lib64/libgomp.so.1" elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then if [[ "$(uname -m)" == "s390x" ]]; then LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1" diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index 6dc2e0f90..ffc280e42 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -117,6 +117,8 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release) if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" +elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then + LIBGOMP_PATH="/usr/lib64/libgomp.so.1" elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then diff --git a/manywheel/build_libtorch.sh b/manywheel/build_libtorch.sh index 2436d5b10..ea11f0c51 100644 --- a/manywheel/build_libtorch.sh +++ b/manywheel/build_libtorch.sh @@ -24,6 +24,8 @@ retry () { OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release` if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then retry yum install -q -y zip openssl +elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then + retry yum install -q -y zip openssl elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then retry dnf install -q -y zip openssl elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then @@ -278,7 +280,7 @@ for pkg in /$LIBTORCH_HOUSE_DIR/libtorch*.zip; do if [[ "$filepath" != "$destpath" ]]; then cp $filepath $destpath fi - + if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then patchedpath=$(fname_without_so_number $destpath) else @@ -299,7 +301,7 @@ for pkg in /$LIBTORCH_HOUSE_DIR/libtorch*.zip; do patchedname=${patched[i]} if [[ "$origname" != "$patchedname" ]] || [[ "$DESIRED_CUDA" == *"rocm"* ]]; then set +e - origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*") + origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*") ERRCODE=$? set -e if [ "$ERRCODE" -eq "0" ]; then From 242fa685747ba6c345a9c5cc41fa0d8b2f0d0c57 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 13 May 2024 11:34:32 -0400 Subject: [PATCH 26/33] [BE] Remove unused files and dead code (#1819) --- .circleci/scripts/binary_checkout.sh | 61 -------------------- .circleci/scripts/binary_populate_env.sh | 1 - cron/update_s3_htmls.sh | 71 ------------------------ manywheel/build_scripts/ssl-check.py | 33 ----------- manywheel/test_wheel.sh | 27 --------- 5 files changed, 193 deletions(-) delete mode 100755 .circleci/scripts/binary_checkout.sh delete mode 100755 cron/update_s3_htmls.sh delete mode 100644 manywheel/build_scripts/ssl-check.py delete mode 100755 manywheel/test_wheel.sh diff --git a/.circleci/scripts/binary_checkout.sh b/.circleci/scripts/binary_checkout.sh deleted file mode 100755 index b634f5c9a..000000000 --- a/.circleci/scripts/binary_checkout.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Derived from https://github.com/pytorch/pytorch/blob/2c7df1360aa17d4a6d6726998eede3671bcb36ee/.circleci/scripts/binary_populate_env.sh - -set -eux -o pipefail - -retry () { - $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) -} - - -# This step runs on multiple executors with different envfile locations -if [[ "$OSTYPE" == "msys" ]]; then - # windows executor (builds and tests) - rm -rf /c/w - ln -s "${HOME}" /c/w - WORK_DIR="/c/w" -elif [[ -d "/home/circleci/project" ]]; then - # machine executor (binary tests) - WORK_DIR="${HOME}/project" -else - # macos executor (builds and tests) - # docker executor (binary builds) - WORK_DIR="${HOME}" -fi - -if [[ "$OSTYPE" == "msys" ]]; then - # We need to make the paths as short as possible on Windows - PYTORCH_ROOT="$WORK_DIR/p" - BUILDER_ROOT="$WORK_DIR/b" -else - PYTORCH_ROOT="$WORK_DIR/pytorch" - BUILDER_ROOT="$WORK_DIR/builder" -fi - -# Persist these variables for the subsequent steps -echo "export WORK_DIR=${WORK_DIR}" >> ${BASH_ENV} -echo "export PYTORCH_ROOT=${PYTORCH_ROOT}" >> ${BASH_ENV} -echo "export BUILDER_ROOT=${BUILDER_ROOT}" >> ${BASH_ENV} - -# Clone the Pytorch branch -retry git clone --depth 1 https://github.com/pytorch/pytorch.git "$PYTORCH_ROOT" -# Removed checking out pytorch/pytorch using CIRCLE_PR_NUMBER and CIRCLE_SHA1 as -# those environment variables are tied to the host repo where the build is being -# triggered. -retry git submodule update --init --recursive -pushd "$PYTORCH_ROOT" -echo "Using Pytorch from " -git --no-pager log --max-count 1 -popd - -# Clone the Builder master repo -retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT" -pushd "$BUILDER_ROOT" -if [[ -n "${CIRCLE_SHA1:-}" ]]; then - # Check out a specific commit (typically the latest) from pytorch/builder - git reset --hard "${CIRCLE_SHA1}" - git checkout -q -B main -fi -echo "Using builder from " -git --no-pager log --max-count 1 -popd diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 7e663a64b..5b141ac38 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -143,7 +143,6 @@ export BUILD_JNI=$BUILD_JNI export PIP_UPLOAD_FOLDER="$PIP_UPLOAD_FOLDER" export DOCKER_IMAGE="$DOCKER_IMAGE" -# Remove WORKD_DIR, PYTORCH_ROOT, BUILDER_ROOT defined & persisted in binary_checkout.sh export MAC_PACKAGE_WORK_DIR="$WORK_DIR" export MINICONDA_ROOT="$WORK_DIR/miniconda" export PYTORCH_FINAL_PACKAGE_DIR="$WORK_DIR/final_pkgs" diff --git a/cron/update_s3_htmls.sh b/cron/update_s3_htmls.sh deleted file mode 100755 index 2dbd172a5..000000000 --- a/cron/update_s3_htmls.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -set -e - -# Update the html links file in the s3 bucket Pip uses this html file to look -# through all the wheels and pick the most recently uploaded one (by the -# version, not the actual date of upload). There is one html file per cuda/cpu -# version - -# Upload for all CUDA/cpu versions if not given one to use -if [[ -z "$CUDA_VERSIONS" ]]; then - export CUDA_VERSIONS=('cpu' 'cu92' 'cu100' 'cu101' 'cu102' 'cu110' 'rocm5.0' 'rocm5.1.1') -fi - -if [[ -z "$HTML_NAME" ]]; then - export HTML_NAME='torch_nightly.html' -fi - -# Dry run disabled by default for legacy purposes -DRY_RUN=${DRY_RUN:-disabled} -DRY_RUN_FLAG="" -if [[ "${DRY_RUN}" != disabled ]]; then - DRY_RUN_FLAG="--dryrun" -fi - -# NB: includes trailing slash (from PIP_UPLOAD_FOLDER) -s3_base="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}" - -# Pull all existing whls in this directory and turn them into html links -# N.B. we use the .dev as a hacky way to exclude all wheels with old -# 'yyyy.mm.dd' versions -# -# NB: replacing + with %2B is to fix old versions of pip which don't -# this transform automatically. This makes the display a little -# ugly but whatever -function generate_html() { - # Trailing slash required in both cases - dir="$1" - url_prefix="$2" - aws s3 ls "${s3_base}${dir}" | grep --only-matching '\S*\.whl' | sed 's#+#%2B#g' | sed 's#.*#'"${url_prefix}"'&
#g' -} - -# This will be included in all the sub-indices -generate_html '' '../' > "root-$HTML_NAME" -generate_html '' '' > "$HTML_NAME" - -for cuda_ver in "${CUDA_VERSIONS[@]}"; do - generate_html "${cuda_ver}/" "" > "${cuda_ver}-$HTML_NAME" - cat "root-$HTML_NAME" >> "${cuda_ver}-$HTML_NAME" - generate_html "${cuda_ver}/" "${cuda_ver}/" >> "$HTML_NAME" - - # Check your work every once in a while - echo "Setting ${cuda_ver}/$HTML_NAME to:" - cat "${cuda_ver}-$HTML_NAME" - ( - set -x - aws s3 cp ${DRY_RUN_FLAG} "${cuda_ver}-$HTML_NAME" "s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${cuda_ver}/$HTML_NAME" --acl public-read --cache-control 'no-cache,no-store,must-revalidate' - ) - -done - -# Check your work every once in a while -echo "Setting $HTML_NAME to:" -cat "$HTML_NAME" -( - set -x - - # Upload the html file back up - # Note the lack of a / b/c duplicate / do cause problems in s3 - aws s3 cp ${DRY_RUN_FLAG} "$HTML_NAME" "$s3_base$HTML_NAME" --acl public-read --cache-control 'no-cache,no-store,must-revalidate' -) diff --git a/manywheel/build_scripts/ssl-check.py b/manywheel/build_scripts/ssl-check.py deleted file mode 100644 index b91927173..000000000 --- a/manywheel/build_scripts/ssl-check.py +++ /dev/null @@ -1,33 +0,0 @@ -# cf. https://github.com/pypa/manylinux/issues/53 - -GOOD_SSL = "https://google.com" -BAD_SSL = "https://self-signed.badssl.com" - -import sys - -print("Testing SSL certificate checking for Python:", sys.version) - -if (sys.version_info[:2] < (2, 7) - or sys.version_info[:2] < (3, 4)): - print("This version never checks SSL certs; skipping tests") - sys.exit(0) - -if sys.version_info[0] >= 3: - from urllib.request import urlopen - EXC = OSError -else: - from urllib import urlopen - EXC = IOError - -print("Connecting to %s should work" % (GOOD_SSL,)) -urlopen(GOOD_SSL) -print("...it did, yay.") - -print("Connecting to %s should fail" % (BAD_SSL,)) -try: - urlopen(BAD_SSL) - # If we get here then we failed: - print("...it DIDN'T!!!!!11!!1one!") - sys.exit(1) -except EXC: - print("...it did, yay.") \ No newline at end of file diff --git a/manywheel/test_wheel.sh b/manywheel/test_wheel.sh deleted file mode 100755 index ada7d93f0..000000000 --- a/manywheel/test_wheel.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash -set -e - -yum install -y wget git - -rm -rf /usr/local/cuda* - -# Install Anaconda -if ! ls /py -then - echo "Miniconda needs to be installed" - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh - bash ~/miniconda.sh -b -p /py -else - echo "Miniconda is already installed" -fi - -export PATH="/py/bin:$PATH" - -# Anaconda token -if ls /remote/token -then - source /remote/token -fi - -conda install -y conda-build anaconda-client - From 1f19db52205b57dddf12321e71fdfba933c4cce1 Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Mon, 13 May 2024 12:54:00 -0500 Subject: [PATCH 27/33] arch64: CD: add manylinux_2_28 docker build workflow (#1784) --- .github/workflows/build-manywheel-images.yml | 17 ++++++ manywheel/Dockerfile_2_28_aarch64 | 56 ++++++++++++++++++++ manywheel/build_docker.sh | 8 +++ 3 files changed, 81 insertions(+) create mode 100644 manywheel/Dockerfile_2_28_aarch64 diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index 33c0a12be..d88fcbedd 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -14,6 +14,7 @@ on: - manywheel/Dockerfile - manywheel/Dockerfile_2_28 - manywheel/Dockerfile_aarch64 + - manywheel/Dockerfile_2_28_aarch64 - manywheel/Dockerfile_cuda_aarch64 - manywheel/Dockerfile_cxx11-abi - manywheel/build_docker.sh @@ -24,6 +25,7 @@ on: - manywheel/Dockerfile - manywheel/Dockerfile_2_28 - manywheel/Dockerfile_aarch64 + - manywheel/Dockerfile_2_28_aarch64 - manywheel/Dockerfile_cuda_aarch64 - manywheel/Dockerfile_cxx11-abi - 'common/*' @@ -139,6 +141,21 @@ jobs: - name: Build Docker Image run: | manywheel/build_docker.sh + build-docker-cpu-aarch64-2_28: + runs-on: linux.arm64.2xlarge + env: + GPU_ARCH_TYPE: cpu-aarch64-2_28 + steps: + - name: Checkout PyTorch + uses: actions/checkout@v3 + - name: Authenticate if WITH_PUSH + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + run: | + manywheel/build_docker.sh build-docker-cpu-cxx11-abi: runs-on: ubuntu-22.04 env: diff --git a/manywheel/Dockerfile_2_28_aarch64 b/manywheel/Dockerfile_2_28_aarch64 new file mode 100644 index 000000000..222d261ef --- /dev/null +++ b/manywheel/Dockerfile_2_28_aarch64 @@ -0,0 +1,56 @@ +FROM quay.io/pypa/manylinux_2_28_aarch64 as base + +# Graviton needs GCC 10 or above for the build. GCC12 is the default version in almalinux-8. +ARG GCCTOOLSET_VERSION=11 + +# Language variabes +ENV LC_ALL=en_US.UTF-8 +ENV LANG=en_US.UTF-8 +ENV LANGUAGE=en_US.UTF-8 + +# Installed needed OS packages. This is to support all +# the binary builds (torch, vision, audio, text, data) +RUN yum -y install epel-release +RUN yum -y update +RUN yum install -y \ + autoconf \ + automake \ + bison \ + bzip2 \ + curl \ + diffutils \ + file \ + git \ + less \ + libffi-devel \ + libgomp \ + make \ + openssl-devel \ + patch \ + perl \ + unzip \ + util-linux \ + wget \ + which \ + xz \ + yasm \ + zstd \ + gcc-toolset-${GCCTOOLSET_VERSION}-toolchain + +# Ensure the expected devtoolset is used +ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH + +# git236+ would refuse to run git commands in repos owned by other users +# Which causes version check to fail, as pytorch repo is bind-mounted into the image +# Override this behaviour by treating every folder as safe +# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327 +RUN git config --global --add safe.directory "*" + +FROM base as final + +# remove unncessary python versions +RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 +RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 +RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 +RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 diff --git a/manywheel/build_docker.sh b/manywheel/build_docker.sh index 05f3dad81..5c35d32ae 100755 --- a/manywheel/build_docker.sh +++ b/manywheel/build_docker.sh @@ -36,6 +36,14 @@ case ${GPU_ARCH_TYPE} in DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10" MANY_LINUX_VERSION="aarch64" ;; + cpu-aarch64-2_28) + TARGET=final + DOCKER_TAG=cpu-aarch64 + LEGACY_DOCKER_IMAGE=${DOCKER_REGISTRY}/pytorch/manylinux_2_28-cpu-aarch64 + GPU_IMAGE=arm64v8/almalinux:8 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11" + MANY_LINUX_VERSION="2_28_aarch64" + ;; cpu-cxx11-abi) TARGET=final DOCKER_TAG=cpu-cxx11-abi From 455b572ecbc116bc40fa0bdae475d935c6280c58 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Mon, 13 May 2024 16:12:45 -0400 Subject: [PATCH 28/33] Revert "[BE] Remove unused files and dead code" (#1821) This reverts commit bebc062488523afe62b8ec90ee91455316448406. --- .circleci/scripts/binary_checkout.sh | 61 ++++++++++++++++++++ .circleci/scripts/binary_populate_env.sh | 1 + cron/update_s3_htmls.sh | 71 ++++++++++++++++++++++++ manywheel/build_scripts/ssl-check.py | 33 +++++++++++ manywheel/test_wheel.sh | 27 +++++++++ 5 files changed, 193 insertions(+) create mode 100755 .circleci/scripts/binary_checkout.sh create mode 100755 cron/update_s3_htmls.sh create mode 100644 manywheel/build_scripts/ssl-check.py create mode 100755 manywheel/test_wheel.sh diff --git a/.circleci/scripts/binary_checkout.sh b/.circleci/scripts/binary_checkout.sh new file mode 100755 index 000000000..b634f5c9a --- /dev/null +++ b/.circleci/scripts/binary_checkout.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# Derived from https://github.com/pytorch/pytorch/blob/2c7df1360aa17d4a6d6726998eede3671bcb36ee/.circleci/scripts/binary_populate_env.sh + +set -eux -o pipefail + +retry () { + $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) +} + + +# This step runs on multiple executors with different envfile locations +if [[ "$OSTYPE" == "msys" ]]; then + # windows executor (builds and tests) + rm -rf /c/w + ln -s "${HOME}" /c/w + WORK_DIR="/c/w" +elif [[ -d "/home/circleci/project" ]]; then + # machine executor (binary tests) + WORK_DIR="${HOME}/project" +else + # macos executor (builds and tests) + # docker executor (binary builds) + WORK_DIR="${HOME}" +fi + +if [[ "$OSTYPE" == "msys" ]]; then + # We need to make the paths as short as possible on Windows + PYTORCH_ROOT="$WORK_DIR/p" + BUILDER_ROOT="$WORK_DIR/b" +else + PYTORCH_ROOT="$WORK_DIR/pytorch" + BUILDER_ROOT="$WORK_DIR/builder" +fi + +# Persist these variables for the subsequent steps +echo "export WORK_DIR=${WORK_DIR}" >> ${BASH_ENV} +echo "export PYTORCH_ROOT=${PYTORCH_ROOT}" >> ${BASH_ENV} +echo "export BUILDER_ROOT=${BUILDER_ROOT}" >> ${BASH_ENV} + +# Clone the Pytorch branch +retry git clone --depth 1 https://github.com/pytorch/pytorch.git "$PYTORCH_ROOT" +# Removed checking out pytorch/pytorch using CIRCLE_PR_NUMBER and CIRCLE_SHA1 as +# those environment variables are tied to the host repo where the build is being +# triggered. +retry git submodule update --init --recursive +pushd "$PYTORCH_ROOT" +echo "Using Pytorch from " +git --no-pager log --max-count 1 +popd + +# Clone the Builder master repo +retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT" +pushd "$BUILDER_ROOT" +if [[ -n "${CIRCLE_SHA1:-}" ]]; then + # Check out a specific commit (typically the latest) from pytorch/builder + git reset --hard "${CIRCLE_SHA1}" + git checkout -q -B main +fi +echo "Using builder from " +git --no-pager log --max-count 1 +popd diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 5b141ac38..7e663a64b 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -143,6 +143,7 @@ export BUILD_JNI=$BUILD_JNI export PIP_UPLOAD_FOLDER="$PIP_UPLOAD_FOLDER" export DOCKER_IMAGE="$DOCKER_IMAGE" +# Remove WORKD_DIR, PYTORCH_ROOT, BUILDER_ROOT defined & persisted in binary_checkout.sh export MAC_PACKAGE_WORK_DIR="$WORK_DIR" export MINICONDA_ROOT="$WORK_DIR/miniconda" export PYTORCH_FINAL_PACKAGE_DIR="$WORK_DIR/final_pkgs" diff --git a/cron/update_s3_htmls.sh b/cron/update_s3_htmls.sh new file mode 100755 index 000000000..2dbd172a5 --- /dev/null +++ b/cron/update_s3_htmls.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +set -e + +# Update the html links file in the s3 bucket Pip uses this html file to look +# through all the wheels and pick the most recently uploaded one (by the +# version, not the actual date of upload). There is one html file per cuda/cpu +# version + +# Upload for all CUDA/cpu versions if not given one to use +if [[ -z "$CUDA_VERSIONS" ]]; then + export CUDA_VERSIONS=('cpu' 'cu92' 'cu100' 'cu101' 'cu102' 'cu110' 'rocm5.0' 'rocm5.1.1') +fi + +if [[ -z "$HTML_NAME" ]]; then + export HTML_NAME='torch_nightly.html' +fi + +# Dry run disabled by default for legacy purposes +DRY_RUN=${DRY_RUN:-disabled} +DRY_RUN_FLAG="" +if [[ "${DRY_RUN}" != disabled ]]; then + DRY_RUN_FLAG="--dryrun" +fi + +# NB: includes trailing slash (from PIP_UPLOAD_FOLDER) +s3_base="s3://pytorch/whl/${PIP_UPLOAD_FOLDER}" + +# Pull all existing whls in this directory and turn them into html links +# N.B. we use the .dev as a hacky way to exclude all wheels with old +# 'yyyy.mm.dd' versions +# +# NB: replacing + with %2B is to fix old versions of pip which don't +# this transform automatically. This makes the display a little +# ugly but whatever +function generate_html() { + # Trailing slash required in both cases + dir="$1" + url_prefix="$2" + aws s3 ls "${s3_base}${dir}" | grep --only-matching '\S*\.whl' | sed 's#+#%2B#g' | sed 's#.*#'"${url_prefix}"'&
#g' +} + +# This will be included in all the sub-indices +generate_html '' '../' > "root-$HTML_NAME" +generate_html '' '' > "$HTML_NAME" + +for cuda_ver in "${CUDA_VERSIONS[@]}"; do + generate_html "${cuda_ver}/" "" > "${cuda_ver}-$HTML_NAME" + cat "root-$HTML_NAME" >> "${cuda_ver}-$HTML_NAME" + generate_html "${cuda_ver}/" "${cuda_ver}/" >> "$HTML_NAME" + + # Check your work every once in a while + echo "Setting ${cuda_ver}/$HTML_NAME to:" + cat "${cuda_ver}-$HTML_NAME" + ( + set -x + aws s3 cp ${DRY_RUN_FLAG} "${cuda_ver}-$HTML_NAME" "s3://pytorch/whl/${PIP_UPLOAD_FOLDER}${cuda_ver}/$HTML_NAME" --acl public-read --cache-control 'no-cache,no-store,must-revalidate' + ) + +done + +# Check your work every once in a while +echo "Setting $HTML_NAME to:" +cat "$HTML_NAME" +( + set -x + + # Upload the html file back up + # Note the lack of a / b/c duplicate / do cause problems in s3 + aws s3 cp ${DRY_RUN_FLAG} "$HTML_NAME" "$s3_base$HTML_NAME" --acl public-read --cache-control 'no-cache,no-store,must-revalidate' +) diff --git a/manywheel/build_scripts/ssl-check.py b/manywheel/build_scripts/ssl-check.py new file mode 100644 index 000000000..b91927173 --- /dev/null +++ b/manywheel/build_scripts/ssl-check.py @@ -0,0 +1,33 @@ +# cf. https://github.com/pypa/manylinux/issues/53 + +GOOD_SSL = "https://google.com" +BAD_SSL = "https://self-signed.badssl.com" + +import sys + +print("Testing SSL certificate checking for Python:", sys.version) + +if (sys.version_info[:2] < (2, 7) + or sys.version_info[:2] < (3, 4)): + print("This version never checks SSL certs; skipping tests") + sys.exit(0) + +if sys.version_info[0] >= 3: + from urllib.request import urlopen + EXC = OSError +else: + from urllib import urlopen + EXC = IOError + +print("Connecting to %s should work" % (GOOD_SSL,)) +urlopen(GOOD_SSL) +print("...it did, yay.") + +print("Connecting to %s should fail" % (BAD_SSL,)) +try: + urlopen(BAD_SSL) + # If we get here then we failed: + print("...it DIDN'T!!!!!11!!1one!") + sys.exit(1) +except EXC: + print("...it did, yay.") \ No newline at end of file diff --git a/manywheel/test_wheel.sh b/manywheel/test_wheel.sh new file mode 100755 index 000000000..ada7d93f0 --- /dev/null +++ b/manywheel/test_wheel.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +set -e + +yum install -y wget git + +rm -rf /usr/local/cuda* + +# Install Anaconda +if ! ls /py +then + echo "Miniconda needs to be installed" + wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh + bash ~/miniconda.sh -b -p /py +else + echo "Miniconda is already installed" +fi + +export PATH="/py/bin:$PATH" + +# Anaconda token +if ls /remote/token +then + source /remote/token +fi + +conda install -y conda-build anaconda-client + From 85e8b9f6b674e53c459dfd30dcd82ebd482bf5ae Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 14 May 2024 10:46:15 -0400 Subject: [PATCH 29/33] Add manylinux_2_28 cuda docker images (#1820) --- .github/workflows/build-manywheel-images.yml | 21 ++++++++++++++++++++ manywheel/build_docker.sh | 8 ++++++++ 2 files changed, 29 insertions(+) diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index d88fcbedd..333f3de5d 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -60,6 +60,27 @@ jobs: - name: Build Docker Image run: | manywheel/build_docker.sh + build-docker-cuda-manylinux_2_28: + runs-on: linux.12xlarge.ephemeral + strategy: + matrix: + cuda_version: ["12.4", "12.1", "11.8"] + env: + GPU_ARCH_TYPE: cuda-manylinux_2_28 + GPU_ARCH_VERSION: ${{ matrix.cuda_version }} + steps: + - name: Purge tools folder (free space for build) + run: rm -rf /opt/hostedtoolcache + - name: Checkout PyTorch builder + uses: actions/checkout@v3 + - name: Authenticate if WITH_PUSH + run: | + if [[ "${WITH_PUSH}" == true ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + fi + - name: Build Docker Image + run: | + manywheel/build_docker.sh build-docker-cuda-aarch64: runs-on: linux.arm64.2xlarge strategy: diff --git a/manywheel/build_docker.sh b/manywheel/build_docker.sh index 5c35d32ae..819a4a003 100755 --- a/manywheel/build_docker.sh +++ b/manywheel/build_docker.sh @@ -68,6 +68,14 @@ case ${GPU_ARCH_TYPE} in GPU_IMAGE=centos:7 DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9" ;; + cuda-manylinux_2_28) + TARGET=cuda_final + DOCKER_TAG=cuda${GPU_ARCH_VERSION} + LEGACY_DOCKER_IMAGE=${DOCKER_REGISTRY}/pytorch/manylinux_2_28-cuda${GPU_ARCH_VERSION//./} + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11" + MANY_LINUX_VERSION="2_28" + ;; cuda-aarch64) TARGET=cuda_final DOCKER_TAG=cuda${GPU_ARCH_VERSION} From 06ca2923b9c1d5bcac1bb4b05f7787749a997927 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 17 May 2024 13:38:39 -0400 Subject: [PATCH 30/33] [Validations] Turn off CUDA exception catch test (#1825) --- .github/scripts/validate_binaries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index bf5c15690..835c16c63 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -62,7 +62,7 @@ else if [[ ${TARGET_OS} == 'windows' ]]; then python ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} else - python3 ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} + python3 ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} --runtime-error-check disabled fi if [[ ${TARGET_OS} == 'macos-arm64' ]]; then From 4116508f78c9f47dc89d0faae164e23cd6859138 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Sun, 19 May 2024 23:16:06 -0700 Subject: [PATCH 31/33] test with linker script enabled --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 2444429cb..298525144 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -199,7 +199,7 @@ def parse_arguments(): branch = "master" print("Building PyTorch wheel") - build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "MAX_JOBS=5 USE_PRIORITIZED_TEXT_FOR_LD=1 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " os.system("cd /pytorch; python setup.py clean") override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") From d1baef5ed76ad2e86b76d2b51e3600b42056fb1c Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Mon, 20 May 2024 00:13:30 -0700 Subject: [PATCH 32/33] reapply acl version 24.04 as git history is messed --- aarch64_linux/aarch64_wheel_ci_build.py | 2 +- aarch64_linux/build_aarch64_wheel.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index 298525144..e9bd7df62 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -78,7 +78,7 @@ def build_ArmComputeLibrary() -> None: "clone", "https://github.com/ARM-software/ComputeLibrary.git", "-b", - "v23.08", + "v24.04", "--depth", "1", "--shallow-submodules", diff --git a/aarch64_linux/build_aarch64_wheel.py b/aarch64_linux/build_aarch64_wheel.py index 0ff286ad2..3956f0463 100755 --- a/aarch64_linux/build_aarch64_wheel.py +++ b/aarch64_linux/build_aarch64_wheel.py @@ -229,7 +229,7 @@ def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None print('Building Arm Compute Library') acl_build_flags=" ".join(["debug=0", "neon=1", "opencl=0", "os=linux", "openmp=1", "cppthreads=0", "arch=armv8a", "multi_isa=1", "fixed_format_kernels=1", "build=native"]) - host.run_cmd(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v23.08 {git_clone_flags}") + host.run_cmd(f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v24.04 {git_clone_flags}") host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}") From d7ffad81c370f2ae484f0b99f2a4454eff547ff8 Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Mon, 20 May 2024 06:37:40 -0700 Subject: [PATCH 33/33] Use export USE_PRIORITIZED_TEXT_FOR_LD=1 instead of command line --- aarch64_linux/aarch64_wheel_ci_build.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aarch64_linux/aarch64_wheel_ci_build.py b/aarch64_linux/aarch64_wheel_ci_build.py index e9bd7df62..6739d24de 100755 --- a/aarch64_linux/aarch64_wheel_ci_build.py +++ b/aarch64_linux/aarch64_wheel_ci_build.py @@ -199,7 +199,8 @@ def parse_arguments(): branch = "master" print("Building PyTorch wheel") - build_vars = "MAX_JOBS=5 USE_PRIORITIZED_TEXT_FOR_LD=1 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + os.system("export USE_PRIORITIZED_TEXT_FOR_LD=1") + build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " os.system("cd /pytorch; python setup.py clean") override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")