diff --git a/check_binary.sh b/check_binary.sh index 153fca745..f38900996 100755 --- a/check_binary.sh +++ b/check_binary.sh @@ -13,7 +13,7 @@ set -eux -o pipefail # 8. Magma is available for CUDA builds # 9. CuDNN is available for CUDA builds # -# This script needs the env variables DESIRED_PYTHON, DESIRED_CUDA, +# This script needs the env variables DESIRED_PYTHON, GPU_ARCH_TYPE, GPU_ARCH_VERSION, # DESIRED_DEVTOOLSET and PACKAGE_TYPE # # This script expects PyTorch to be installed into the active Python (the @@ -38,14 +38,9 @@ else install_root="$(dirname $(which python))/../lib/python${py_dot}/site-packages/torch/" fi -if [[ "$DESIRED_CUDA" != 'cpu' && "$DESIRED_CUDA" != *"rocm"* ]]; then - # cu90, cu92, cu100, cu101 - if [[ ${#DESIRED_CUDA} -eq 4 ]]; then - CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}" - elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then - CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}" - fi - echo "Using CUDA $CUDA_VERSION as determined by DESIRED_CUDA" +if [[ "$GPU_ARCH_TYPE" = 'cuda' ]]; then + CUDA_VERSION=${GPU_ARCH_VERSION} + echo "Using CUDA $CUDA_VERSION as determined by GPU_ARCH_VERSION" # Switch `/usr/local/cuda` to the desired CUDA version rm -rf /usr/local/cuda || true @@ -366,7 +361,7 @@ if [[ "$OSTYPE" == "msys" ]]; then fi # Test that CUDA builds are setup correctly -if [[ "$DESIRED_CUDA" != 'cpu' && "$DESIRED_CUDA" != *"rocm"* ]]; then +if [[ "$GPU_ARCH_TYPE" == 'cuda' ]]; then if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then build_and_run_example_cpp check-torch-cuda else diff --git a/conda/Dockerfile b/conda/Dockerfile index f4f4c834a..ce9cc8b3a 100644 --- a/conda/Dockerfile +++ b/conda/Dockerfile @@ -43,26 +43,32 @@ RUN bash ./install_conda.sh && rm install_conda.sh FROM base as cuda RUN rm -rf /usr/local/cuda-* ADD ./common/install_cuda.sh install_cuda.sh +ENV GPU_ARCH_TYPE=cuda FROM cuda as cuda10.2 RUN bash ./install_cuda.sh 10.2 ENV DESIRED_CUDA=10.2 +ENV GPU_ARCH_VERSION=10.2 FROM cuda as cuda11.3 RUN bash ./install_cuda.sh 11.3 ENV 
DESIRED_CUDA=11.3 +ENV GPU_ARCH_VERSION=11.3 FROM cuda as cuda11.5 RUN bash ./install_cuda.sh 11.5 ENV DESIRED_CUDA=11.5 +ENV GPU_ARCH_VERSION=11.5 FROM cuda as cuda11.6 RUN bash ./install_cuda.sh 11.6 ENV DESIRED_CUDA=11.6 +ENV GPU_ARCH_VERSION=11.6 FROM cuda as cuda11.7 RUN bash ./install_cuda.sh 11.7 ENV DESIRED_CUDA=11.7 +ENV GPU_ARCH_VERSION=11.7 # Install MNIST test data FROM base as mnist diff --git a/conda/README.md b/conda/README.md index 4c1b719f6..4044882ae 100644 --- a/conda/README.md +++ b/conda/README.md @@ -20,7 +20,6 @@ docker push pytorch/conda-builder # building pytorch docker run --rm -it \ -e PACKAGE_TYPE=conda \ - -e DESIRED_CUDA=cu92 \ -e DESIRED_PYTHON=3.8 \ -e PYTORCH_BUILD_VERSION=1.5.0 \ -e PYTORCH_BUILD_NUMBER=1 \ diff --git a/conda/build.sh b/conda/build.sh index ab6176884..0b4e55c14 100755 --- a/conda/build.sh +++ b/conda/build.sh @@ -5,4 +5,4 @@ # TODO: Remove this once we fully move binary builds on master to GHA SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -DESIRED_CUDA=${DESIRED_CUDA:-cpu} bash ${SCRIPTPATH}/build_pytorch.sh +bash ${SCRIPTPATH}/build_pytorch.sh diff --git a/conda/build_pytorch.sh b/conda/build_pytorch.sh index 8ac05edbd..29d5247b0 100755 --- a/conda/build_pytorch.sh +++ b/conda/build_pytorch.sh @@ -33,8 +33,8 @@ retry () { # Parse arguments and determmine version ########################################################### -if [[ -n "$DESIRED_CUDA" && -n "$PYTORCH_BUILD_VERSION" && -n "$PYTORCH_BUILD_NUMBER" ]]; then - desired_cuda="$DESIRED_CUDA" +if [[ -n "$GPU_ARCH_VERSION" && -n "$PYTORCH_BUILD_VERSION" && -n "$PYTORCH_BUILD_NUMBER" ]]; then + desired_cuda="$GPU_ARCH_VERSION" build_version="$PYTORCH_BUILD_VERSION" build_number="$PYTORCH_BUILD_NUMBER" else @@ -49,9 +49,11 @@ else build_version="$2" build_number="$3" fi -if [[ "$desired_cuda" != cpu ]]; then - desired_cuda="$(echo $desired_cuda | tr -d cuda. 
)" + +if [[ $desired_cuda = "" ]]; then + desired_cuda="cpu" fi + echo "Building cuda version $desired_cuda and pytorch version: $build_version build_number: $build_number" if [[ "$OSTYPE" == "msys" ]]; then diff --git a/manywheel/build_common.sh b/manywheel/build_common.sh index 95a67cdd9..e7acb6f5d 100644 --- a/manywheel/build_common.sh +++ b/manywheel/build_common.sh @@ -147,7 +147,7 @@ else export LLVM_DIR="$USE_LLVM/lib/cmake/llvm" fi -if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then +if [[ "$GPU_ARCH_TYPE" = "rocm" ]]; then echo "Calling build_amd.py at $(date)" python tools/amd_build/build_amd.py fi @@ -326,7 +326,7 @@ for pkg in /$WHEELHOUSE_DIR/torch*linux*.whl /$LIBTORCH_HOUSE_DIR/libtorch*.zip; fi # ROCm workaround for roctracer dlopens - if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then + if [[ "${GPU_ARCH_TYPE}" = "rocm" ]]; then patchedpath=$(fname_without_so_number $destpath) else patchedpath=$(fname_with_sha256 $destpath) @@ -459,7 +459,10 @@ if [[ -z "$BUILD_PYTHONLESS" ]]; then echo "$(date) :: Running tests" pushd "$PYTORCH_ROOT" LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \ - "${SOURCE_DIR}/../run_tests.sh" manywheel "${py_majmin}" "$DESIRED_CUDA" + PACKAGE_TYPE=manywheel \ + DESIRED_PYTHON="${py_majmin}" \ + GPU_ARCH_TYPE=${GPU_ARCH_TYPE} \ + "${SOURCE_DIR}/../run_tests.sh" popd echo "$(date) :: Finished tests" fi diff --git a/manywheel/build_cuda.sh b/manywheel/build_cuda.sh index efea1ae93..070e1e22b 100644 --- a/manywheel/build_cuda.sh +++ b/manywheel/build_cuda.sh @@ -24,37 +24,9 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then EXTRA_CAFFE2_CMAKE_FLAGS=() fi -# Determine CUDA version and architectures to build for -# -# NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`, -# because in some cases a single Docker image can have multiple CUDA versions -# on it, and `nvcc --version` might not show the CUDA version we want. 
-if [[ -n "$DESIRED_CUDA" ]]; then - # If the DESIRED_CUDA already matches the format that we expect - if [[ ${DESIRED_CUDA} =~ ^[0-9]+\.[0-9]+$ ]]; then - CUDA_VERSION=${DESIRED_CUDA} - else - # cu90, cu92, cu100, cu101 - if [[ ${#DESIRED_CUDA} -eq 4 ]]; then - CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}" - elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then - CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}" - fi - fi - echo "Using CUDA $CUDA_VERSION as determined by DESIRED_CUDA" - # There really has to be a better way to do this - eli - # Possibly limiting builds to specific cuda versions be delimiting images would be a choice - if [[ "$OS_NAME" == *"Ubuntu"* ]]; then - echo "Switching to CUDA version $desired_cuda" - /builder/conda/switch_cuda_version.sh "${DESIRED_CUDA}" - fi -else - CUDA_VERSION=$(nvcc --version|grep release|cut -f5 -d" "|cut -f1 -d",") - echo "CUDA $CUDA_VERSION Detected" -fi - -cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.') +CUDA_VERSION="${GPU_ARCH_VERSION:-}" +cuda_version_nodot=$(echo "${CUDA_VERSION}" | tr -d '.') TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0" case ${CUDA_VERSION} in @@ -63,7 +35,6 @@ case ${CUDA_VERSION} in EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON") ;; 10.*) - TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON") ;; *) @@ -273,9 +244,6 @@ else exit 1 fi -# builder/test.sh requires DESIRED_CUDA to know what tests to exclude -export DESIRED_CUDA="$cuda_version_nodot" - # Switch `/usr/local/cuda` to the desired CUDA version rm -rf /usr/local/cuda || true ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda diff --git a/manywheel/build_libtorch.sh b/manywheel/build_libtorch.sh index f481dfd90..d7a77eabe 100644 --- a/manywheel/build_libtorch.sh +++ b/manywheel/build_libtorch.sh @@ -113,7 +113,7 @@ else export LLVM_DIR="$USE_LLVM/lib/cmake/llvm" fi -if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then +if [[ "$GPU_ARCH_TYPE" = "rocm" ]]; then echo "Calling build_amd.py at $(date)" 
python tools/amd_build/build_amd.py # TODO remove this work-around once pytorch sources are updated diff --git a/manywheel/build_rocm.sh b/manywheel/build_rocm.sh index 9b4d36348..bb70053b4 100755 --- a/manywheel/build_rocm.sh +++ b/manywheel/build_rocm.sh @@ -25,19 +25,8 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then fi # Determine ROCm version and architectures to build for -# -# NOTE: We should first check `DESIRED_CUDA` when determining `ROCM_VERSION` -if [[ -n "$DESIRED_CUDA" ]]; then - if ! echo "${DESIRED_CUDA}"| grep "^rocm" >/dev/null 2>/dev/null; then - export DESIRED_CUDA="rocm${DESIRED_CUDA}" - fi - # rocm3.7, rocm3.5.1 - ROCM_VERSION="$DESIRED_CUDA" - echo "Using $ROCM_VERSION as determined by DESIRED_CUDA" -else - echo "Must set DESIRED_CUDA" - exit 1 -fi +ROCM_VERSION="$GPU_ARCH_VERSION" +echo "Using $ROCM_VERSION as determined by GPU_ARCH_VERSION" # Package directories WHEELHOUSE_DIR="wheelhouse$ROCM_VERSION" diff --git a/run_tests.sh b/run_tests.sh index 18b00f00b..a3b597cc6 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -35,31 +35,18 @@ retry () { # Parameters ############################################################################## if [[ "$#" != 3 ]]; then - if [[ -z "${DESIRED_PYTHON:-}" || -z "${DESIRED_CUDA:-}" || -z "${PACKAGE_TYPE:-}" ]]; then - echo "USAGE: run_tests.sh PACKAGE_TYPE DESIRED_PYTHON DESIRED_CUDA" + if [[ -z "${DESIRED_PYTHON:-}" || -z "${PACKAGE_TYPE:-}" || -z "${GPU_ARCH_TYPE:-}" ]]; then + echo "USAGE: run_tests.sh PACKAGE_TYPE DESIRED_PYTHON GPU_ARCH_TYPE" echo "The env variable PACKAGE_TYPE must be set to 'conda' or 'manywheel' or 'libtorch'" echo "The env variable DESIRED_PYTHON must be set like '2.7mu' or '3.6m' etc" - echo "The env variable DESIRED_CUDA must be set like 'cpu' or 'cu80' etc" + echo "The env variable GPU_ARCH_TYPE must be set like 'cpu' or 'cuda' or 'rocm' etc" exit 1 fi package_type="$PACKAGE_TYPE" py_ver="$DESIRED_PYTHON" - cuda_ver="$DESIRED_CUDA" else package_type="$1" py_ver="$2" - 
cuda_ver="$3" -fi - -if [[ "$cuda_ver" == 'cpu-cxx11-abi' ]]; then - cuda_ver="cpu" -fi - -# cu80, cu90, cu100, cpu -if [[ ${#cuda_ver} -eq 4 ]]; then - cuda_ver_majmin="${cuda_ver:2:1}.${cuda_ver:3:1}" -elif [[ ${#cuda_ver} -eq 5 ]]; then - cuda_ver_majmin="${cuda_ver:2:2}.${cuda_ver:4:1}" fi NUMPY_PACKAGE="" @@ -80,7 +67,7 @@ if [[ "$package_type" == conda || "$(uname)" == Darwin ]]; then # overwrite the currently installed "local" pytorch package meaning you aren't actually testing # the right package. # TODO (maybe): Make the "cpu" package of pytorch depend on "cpuonly" - if [[ "$cuda_ver" = 'cpu' ]]; then + if [[ "${GPU_ARCH_TYPE}" = 'cpu' ]]; then # Installing cpuonly will also install dependencies as well retry conda install -y -c pytorch cpuonly else @@ -139,7 +126,7 @@ echo "Checking that we are testing the package that is just built" python -c "import torch; exit(0 if torch.__version__ == '$expected_version' else 1)" # Test that CUDA builds are setup correctly -if [[ "$cuda_ver" != 'cpu' ]]; then +if [[ "${GPU_ARCH_TYPE}" != 'cpu' ]]; then # Test CUDA archs echo "Checking that CUDA archs are setup correctly" timeout 20 python -c 'import torch; torch.randn([3,5]).cuda()' @@ -165,573 +152,3 @@ if [[ "$(uname)" == 'Darwin' ]]; then fi popd - -# TODO re-enable the other tests after the nightlies are moved to CI. This is -# because the binaries keep breaking, often from additional tests, that aren't -# real problems. Once these are on circleci and a smoke-binary-build is added -# to PRs then this should stop happening and these can be re-enabled. -echo "Not running unit tests. 
Hopefully these problems are caught by CI" -exit 0 - - -############################################################################## -# Running unit tests (except not right now) -############################################################################## -echo "$(date) :: Starting tests for $package_type package for python$py_ver and $cuda_ver" - -# We keep track of exact tests to skip, as otherwise we would be hardly running -# any tests. But b/c of issues working with pytest/normal-python-test/ and b/c -# of special snowflake tests in test/run_test.py we also take special care of -# those -tests_to_skip=() - -# -# Entire file exclusions -############################################################################## -entire_file_exclusions=("-x") - -# cpp_extensions doesn't work with pytest, so we exclude it from the pytest run -# here and then manually run it later. Note that this is only because this -# entire_fil_exclusions flag is only passed to the pytest run -entire_file_exclusions+=("cpp_extensions") - -# TODO temporary line to fix next days nightlies, but should be removed when -# issue is fixed -entire_file_exclusions+=('type_info') - -if [[ "$cuda_ver" == 'cpu' ]]; then - # test/test_cuda.py exits early if the installed torch is not built with - # CUDA, but the exit doesn't work when running with pytest, so pytest will - # still try to run all the CUDA tests and then fail - entire_file_exclusions+=("cuda") - entire_file_exclusions+=("nccl") -fi - -if [[ "$(uname)" == 'Darwin' || "$OSTYPE" == "msys" ]]; then - # pytest on Mac doesn't like the exits in these files - entire_file_exclusions+=('c10d') - entire_file_exclusions+=('distributed') - - # pytest doesn't mind the exit but fails the tests. 
On Mac we run this - # later without pytest - entire_file_exclusions+=('thd_distributed') -fi - - -# -# Universal flaky tests -############################################################################## - -# RendezvousEnvTest sometimes hangs forever -# Otherwise it will fail on CUDA with -# Traceback (most recent call last): -# File "test_c10d.py", line 179, in test_common_errors -# next(gen) -# AssertionError: ValueError not raised -tests_to_skip+=('RendezvousEnvTest and test_common_errors') - -# This hung forever once on conda_3.5_cu92 -tests_to_skip+=('TestTorch and test_sum_dim') - -# test_trace_warn isn't actually flaky, but it doesn't work with pytest so we -# just skip it -tests_to_skip+=('TestJit and test_trace_warn') -# -# Python specific flaky tests -############################################################################## - -# test_dataloader.py:721: AssertionError -# looks like a timeout, but interestingly only appears on python 3 -if [[ "$py_ver" == 3* ]]; then - tests_to_skip+=('TestDataLoader and test_proper_exit') -fi - -# -# CUDA flaky tests, all package types -############################################################################## -if [[ "$cuda_ver" != 'cpu' ]]; then - - # - # DistributedDataParallelTest - # All of these seem to fail - tests_to_skip+=('DistributedDataParallelTest') - - # - # RendezvousEnvTest - # Traceback (most recent call last): - # File "test_c10d.py", line 201, in test_nominal - # store0, rank0, size0 = next(gen0) - # File "/opt/python/cp36-cp36m/lib/python3.6/site-packages/torch/distributed/rendezvous.py", line 131, in _env_rendezvous_handler - # store = TCPStore(master_addr, master_port, start_daemon) - # RuntimeError: Address already in use - tests_to_skip+=('RendezvousEnvTest and test_nominal') - - # - # TestCppExtension - # - # Traceback (most recent call last): - # File "test_cpp_extensions.py", line 134, in test_jit_cudnn_extension - # with_cuda=True) - # File 
"/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 552, in load - # with_cuda) - # File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 729, in _jit_compile - # return _import_module_from_library(name, build_directory) - # File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 867, in _import_module_from_library - # return imp.load_module(module_name, file, path, description) - # File "/opt/python/cp35-cp35m/lib/python3.5/imp.py", line 243, in load_module - # return load_dynamic(name, filename, file) - # File "/opt/python/cp35-cp35m/lib/python3.5/imp.py", line 343, in load_dynamic - # return _load(spec) - # File "", line 693, in _load - # File "", line 666, in _load_unlocked - # File "", line 577, in module_from_spec - # File "", line 938, in create_module - # File "", line 222, in _call_with_frames_removed - # ImportError: libcudnn.so.7: cannot open shared object file: No such file or directory - tests_to_skip+=('TestCppExtension and test_jit_cudnn_extension') - - # - # TestCuda - # - - # 3.7_cu80 - # RuntimeError: CUDA error: out of memory - tests_to_skip+=('TestCuda and test_arithmetic_large_tensor') - - # 3.7_cu80 - # RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch-nightly_1538097262541/work/aten/src/THC/THCTensorCopy.cu:205 - tests_to_skip+=('TestCuda and test_autogpu') - - # - # TestDistBackend - # - - # Traceback (most recent call last): - # File "test_thd_distributed.py", line 1046, in wrapper - # self._join_and_reduce(fn) - # File "test_thd_distributed.py", line 1108, in _join_and_reduce - # self.assertEqual(p.exitcode, first_process.exitcode) - # File "/pytorch/test/common.py", line 399, in assertEqual - # super(TestCase, self).assertEqual(x, y, message) - # AssertionError: None != 77 : - tests_to_skip+=('TestDistBackend and test_all_gather_group') - tests_to_skip+=('TestDistBackend and 
test_all_reduce_group_max') - tests_to_skip+=('TestDistBackend and test_all_reduce_group_min') - tests_to_skip+=('TestDistBackend and test_all_reduce_group_sum') - tests_to_skip+=('TestDistBackend and test_all_reduce_group_product') - tests_to_skip+=('TestDistBackend and test_barrier_group') - tests_to_skip+=('TestDistBackend and test_broadcast_group') - - # Traceback (most recent call last): - # File "test_thd_distributed.py", line 1046, in wrapper - # self._join_and_reduce(fn) - # File "test_thd_distributed.py", line 1108, in _join_and_reduce - # self.assertEqual(p.exitcode, first_process.exitcode) - # File "/pytorch/test/common.py", line 397, in assertEqual - # super(TestCase, self).assertLessEqual(abs(x - y), prec, message) - # AssertionError: 12 not less than or equal to 1e-05 - tests_to_skip+=('TestDistBackend and test_barrier') - - # Traceback (most recent call last): - # File "test_distributed.py", line 1267, in wrapper - # self._join_and_reduce(fn) - # File "test_distributed.py", line 1350, in _join_and_reduce - # self.assertEqual(p.exitcode, first_process.exitcode) - # File "/pytorch/test/common.py", line 399, in assertEqual - # super(TestCase, self).assertEqual(x, y, message) - # AssertionError: None != 1 - tests_to_skip+=('TestDistBackend and test_broadcast') - - # Memory leak very similar to all the conda ones below, but appears on manywheel - # 3.6m_cu80 - # AssertionError: 1605632 not less than or equal to 1e-05 : __main__.TestEndToEndHybridFrontendModels.test_vae_cuda leaked 1605632 bytes CUDA memory on device 0 - tests_to_skip+=('TestEndToEndHybridFrontendModels and test_vae_cuda') - - # ________________________ TestNN.test_embedding_bag_cuda ________________________ - # - # self = - # dtype = torch.float32 - # - # @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") - # @repeat_test_for_types(ALL_TENSORTYPES) - # @skipIfRocm - # def test_embedding_bag_cuda(self, dtype=torch.float): - # self._test_EmbeddingBag(True, 'sum', False, dtype) - # 
self._test_EmbeddingBag(True, 'mean', False, dtype) - # self._test_EmbeddingBag(True, 'max', False, dtype) - # if dtype != torch.half: - # # torch.cuda.sparse.HalfTensor is not enabled. - # self._test_EmbeddingBag(True, 'sum', True, dtype) - # > self._test_EmbeddingBag(True, 'mean', True, dtype) - # - # test_nn.py:2144: - # _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - # test_nn.py:2062: in _test_EmbeddingBag - # _test_vs_Embedding(N, D, B, L) - # test_nn.py:2059: in _test_vs_Embedding - # self.assertEqual(es_weight_grad, e.weight.grad, needed_prec) - # common.py:373: in assertEqual - # assertTensorsEqual(x, y) - # common.py:365: in assertTensorsEqual - # self.assertLessEqual(max_err, prec, message) - # E AssertionError: tensor(0.0000, device='cuda:0', dtype=torch.float32) not less than or equal to 2e-05 : - # 1 failed, 1202 passed, 19 skipped, 2 xfailed, 796 warnings in 1166.73 seconds = - # Traceback (most recent call last): - # File "test/run_test.py", line 391, in - # main() - # File "test/run_test.py", line 383, in main - # raise RuntimeError(message) - tests_to_skip+=('TestNN and test_embedding_bag_cuda') -fi - - -########################################################################## -# Conda specific flaky tests -########################################################################## - -# Only on Anaconda's python 2.7 -# So, this doesn't really make sense. All the mac jobs are run on the same -# machine, so the wheel jobs still use conda to silo their python -# installations. The wheel job for Python 2.7 should use the exact same Python -# from conda as the conda job for Python 2.7. Yet, this only appears on the -# conda jobs. 
-if [[ "$package_type" == 'conda' && "$py_ver" == '2.7' ]]; then - # Traceback (most recent call last): - # File "test_jit.py", line 6281, in test_wrong_return_type - # @torch.jit.script - # File "/Users/administrator/nightlies/2018_09_30/wheel_build_dirs/conda_2.7/conda/envs/env_py2.7_0_20180930/lib/python2.7/site-packages/torch/jit/__init__.py", line 639, in script - # graph = _jit_script_compile(ast, rcb) - # File "/Users/administrator/nightlies/2018_09_30/wheel_build_dirs/conda_2.7/conda/envs/env_py2.7_0_20180930/lib/python2.7/site-packages/torch/jit/annotations.py", line 80, in get_signature - # return parse_type_line(type_line) - # File "/Users/administrator/nightlies/2018_09_30/wheel_build_dirs/conda_2.7/conda/envs/env_py2.7_0_20180930/lib/python2.7/site-packages/torch/jit/annotations.py", line 131, in parse_type_line - # return arg_types, ann_to_type(ret_ann) - # File "/Users/administrator/nightlies/2018_09_30/wheel_build_dirs/conda_2.7/conda/envs/env_py2.7_0_20180930/lib/python2.7/site-packages/torch/jit/annotations.py", line 192, in ann_to_type - # return TupleType([ann_to_type(a) for a in ann.__args__]) - # TypeError: 'TupleInstance' object is not iterable - tests_to_skip+=('TestScript and test_wrong_return_type') -fi - -# Lots of memory leaks on CUDA -if [[ "$package_type" == 'conda' && "$cuda_ver" != 'cpu' ]]; then - - # 3.7_cu92 - # AssertionError: 63488 not less than or equal to 1e-05 : __main__.TestEndToEndHybridFrontendModels.test_mnist_cuda leaked 63488 bytes CUDA memory on device 0 - tests_to_skip+=('TestEndToEndHybridFrontendModels and test_mnist_cuda') - - # 2.7_cu92 - # AssertionError: __main__.TestNN.test_BatchNorm3d_momentum_eval_cuda leaked -1024 bytes CUDA memory on device 0 - tests_to_skip+=('TestNN and test_BatchNorm3d_momentum_eval_cuda') - - # - # All of test_BCE is flaky - tests_to_skip+=('TestNN and test_BCE') - - # 3.5_cu80 - # AssertionError: 3584 not less than or equal to 1e-05 : test_nn.TestNN.test_BCEWithLogitsLoss_cuda_double 
leaked 3584 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_BCEWithLogitsLoss_cuda_double') - - # 2.7_cu92 - # AssertionError: __main__.TestNN.test_ConvTranspose2d_cuda leaked -1024 bytes CUDA memory on device 0 - tests_to_skip+=('TestNN and test_ConvTranspose2d_cuda') - - # 3.7_cu90 - # AssertionError: 1024 not less than or equal to 1e-05 : __main__.TestNN.test_ConvTranspose3d_cuda leaked -1024 bytes CUDA memory on device 0 - tests_to_skip+=('TestNN and test_ConvTranspose3d_cuda') - - # - # - # CTCLoss - # These are all flaky - tests_to_skip+=('TestNN and test_CTCLoss') - - # 2.7_cu90 - # 2.7_cu92 - # 3.5_cu90 x2 - # 3.6_cu90 - # 3.7_cu80 x3 - # 3.7_cu90 - # AssertionError: 37376 not less than or equal to 1e-05 : __main__.TestNN.test_CTCLoss_1d_target_cuda_double leaked 37376 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_CTCLoss_1d_target_cuda_double') - - # 2.7_cu80 --18944 - # 2.7_cu92 - # 3.5_cu90 --18944 x2 - # 3.5_cu92 --18944 x2 - # 3.6_cu90 --18944 - # 3.6_cu92 --18944 - # 3.7_cu80 - # AssertionError: 37376 not less than or equal to 1e-05 : __main__.TestNN.test_CTCLoss_1d_target_cuda_float leaked -37376 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_CTCLoss_1d_target_cuda_float') - - # 3.5_cu90 - # 3.7_cu92 - # AssertionError: 37376 not less than or equal to 1e-05 : __main__.TestNN.test_CTCLoss_1d_target_sum_reduction_cuda_double leaked 37376 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_CTCLoss_1d_target_sum_reduction_cuda_double') - - # 3.7_cu92 - # AssertionError: 18432 not less than or equal to 1e-05 : __main__.TestNN.test_CTCLoss_1d_target_sum_reduction_cuda_float leaked -18432 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_CTCLoss_1d_target_sum_reduction_cuda_float') - - # 3.5_cu92 x2 - # 3.6_cu80 - # 3.7_cu90 - # AssertionError: AssertionError: 37376 not less than or equal to 1e-05 : __main__.TestNN.test_CTCLoss_2d_int_target_cuda_double leaked 37376 bytes 
CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_CTCLoss_2d_int_target_cuda_double') - - # 3.5_cu92 - # 3.6_cu80 --37376 - # 3.6_cu92 - # AssertionError: 18944 not less than or equal to 1e-05 : __main__.TestNN.test_CTCLoss_2d_int_target_cuda_float leaked 18944 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_CTCLoss_2d_int_target_cuda_float') - - # 2.7_cu90 - # 3.5_cu80 - # 3.7_cu80 x2 - # AssertionError: 37376 not less than or equal to 1e-05 : __main__.TestNN.test_CTCLoss_2d_int_target_sum_reduction_cuda_double leaked 37376 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_CTCLoss_2d_int_target_sum_reduction_cuda_double') - - # 2.7_cu90 - # 2.7_cu92 --18944 - # AssertionError: __main__.TestNN.test_CTCLoss_2d_int_target_sum_reduction_cuda_float leaked -37376 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_CTCLoss_2d_int_target_sum_reduction_cuda_float') - - # 2.7_cu92 - # AssertionError: __main__.TestNN.test_CTCLoss_cuda_double leaked 37376 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_CTCLoss_cuda_double') - - # 2.7_cu92 - # AssertionError: __main__.TestNN.test_CTCLoss_cuda_float leaked 18944 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_CTCLoss_cuda_float') - - # 2.7_cu92 - # 3.5_cu90 x2 - # 3.5_cu92 - # 3.5_cu92 - # 3.6_cu80 x2 - # AssertionError: 37376 not less than or equal to 1e-05 : __main__.TestNN.test_CTCLoss_sum_reduction_cuda_double leaked 37376 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_CTCLoss_sum_reduction_cuda_double') - - # 2.7_cu92 --18944 - # 3.6_cu80 - # AssertionError: 37376 not less than or equal to 1e-05 : __main__.TestNN.test_CTCLoss_sum_reduction_cuda_float leaked -37376 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_CTCLoss_sum_reduction_cuda_float') - - # - # - # NLLLoss - # These are all flaky - tests_to_skip+=('TestNN and NLLLoss') - - # 3.5_cu90 x2 - # AssertionError: 3584 not less than or equal 
to 1e-05 : __main__.TestNN.test_NLLLoss_2d_cuda_double leaked 3584 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_2d_cuda_double') - - # 2.7_cu80 - # AssertionError: __main__.TestNN.test_NLLLoss_2d_cuda_float leaked 2560 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_2d_cuda_float') - - # 2.7_cu80 - # 2.7_cu92 - # 3.6_cu80 x2 - # AssertionError: 1536 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_2d_cuda_half leaked 1536 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_2d_cuda_half') - - # 2.7_cu90 - # 3.6_cu80 x2 - # 3.6_cu90 - # 3.6_cu92 - # AssertionError: 3584 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_2d_ignore_index_cuda_double leaked 3584 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_2d_ignore_index_cuda_double') - - # 3.6_cu80 x2 - # 3.6_cu90 - # AssertionError: 3584 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_2d_ignore_index_cuda_float leaked -3584 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_2d_ignore_index_cuda_float') - - # 3.6_cu90 - # AssertionError: 3584 not less than or equal to 1e-05 : test_nn.TestNN.test_NLLLoss_2d_weights_cuda_double leaked 3584 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_2d_ignore_index_cuda_half') - - # 3.6_cu80 - # AssertionError: 3584 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_2d_sum_reduction_cuda_double leaked 3584 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_2d_sum_reduction_cuda_double') - - # 3.6_cu80 - # AssertionError: 2560 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_2d_sum_reduction_cuda_float leaked 2560 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_2d_sum_reduction_cuda_float') - - # 3.7_cu92 - # AssertionError: 1536 not less than or equal to 1e-05 : test_nn.TestNN.test_NLLLoss_2d_weights_cuda_half leaked 
1536 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_2d_weights_cuda_half') - - # 3.6_cu80 - # AssertionError: 1536 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_2d_sum_reduction_cuda_half leaked 1536 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_2d_sum_reduction_cuda_half') - - # 2.7_cu92 - # AssertionError: __main__.TestNN.test_NLLLoss_2d_weights_cuda_float leaked 2560 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_2d_weights_cuda_float') - - # 3.5_cu80 x2 - # 3.6_cu90 - # AssertionError: 1536 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_dim_is_3_cuda_double leaked 1536 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_dim_is_3_cuda_double') - - # 3.6_cu80 - # AssertionError: 1536 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_dim_is_3_sum_reduction_cuda_float leaked 1536 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_dim_is_3_sum_reduction_cuda_float') - - # 3.6_cu80 - # 3.7_cu80 x2 - # AssertionError: 1536 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_dim_is_3_sum_reduction_cuda_half leaked 1536 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_dim_is_3_sum_reduction_cuda_half') - - # 3.5_cu80 - # 3.7_cu80 x2 - # AssertionError: 10752 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_higher_dim_cuda_double leaked 10752 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_higher_dim_cuda_double') - - # 3.5_cu80 - # 3.7_cu80 --10752 x2 - # AssertionError: 5120 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_higher_dim_cuda_float leaked -5120 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_higher_dim_cuda_float') - - # 3.5_cu80 - # 3.5 cu90 - # AssertionError: 3584 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_higher_dim_cuda_half leaked 3584 bytes CUDA 
memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_higher_dim_cuda_half') - - # 3.5_cu90 - # AssertionError: 10752 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_higher_dim_sum_reduction_cuda_double leaked 10752 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_higher_dim_sum_reduction_cuda_double') - - # 3.5_cu90 - # AssertionError: 5120 not less than or equal to 1e-05 : __main__.TestNN.test_NLLLoss_higher_dim_sum_reduction_cuda_float leaked -5120 bytes CUDA memory on device 0 - #tests_to_skip+=('TestNN and test_NLLLoss_higher_dim_sum_reduction_cuda_float') - - # ______________________ TestNN.test_variable_sequence_cuda ______________________ - # common_utils.py:277: in wrapper - # method(*args, **kwargs) - # common_utils.py:241: in __exit__ - # self.name, after - before, i)) - # common_utils.py:399: in assertEqual - # super(TestCase, self).assertLessEqual(abs(x - y), prec, message) - # E AssertionError: 1024 not less than or equal to 1e-05 : test_nn.TestNN.test_variable_sequence_cuda leaked 1024 bytes CUDA memory on device 0 - tests_to_skip+=('TestNN and test_variable_sequence_cuda') - - # 3.7_cu90 - # AssertionError: 1024 not less than or equal to 1e-05 : __main__.TestJit.test_fuse_last_device_cuda leaked 1024 bytes CUDA memory on device 1 - tests_to_skip+=('TestJit and test_fuse_last_device_cuda') - - # 3.7_cu92 x2 - # AssertionError: 1024 not less than or equal to 1e-05 : __main__.TestJit.test_ge_cuda leaked 1024 bytes CUDA memory on device 0 - tests_to_skip+=('TestJit and test_ge_cuda') - - # 3.5_cu90 - # AssertionError: 1024 not less than or equal to 1e-05 : test_jit.TestJit.test_comparison_ge_le_cuda leaked -1024 bytes CUDA memory on device 0 - tests_to_skip+=('TestJit and test_comparison_ge_le_cuda') - - # 3.6_cu92 - # 3.7_cu92 - # AssertionError: 1024 not less than or equal to 1e-05 : __main__.TestJit.test_relu_cuda leaked 1024 bytes CUDA memory on device 0 - tests_to_skip+=('TestJit and test_relu_cuda') 
- - # 3.7_cu92 x3 - # AssertionError: 1024 not less than or equal to 1e-05 : __main__.TestScript.test_milstm_fusion_cuda leaked 1024 bytes CUDA memory on device 1 - tests_to_skip+=('TestScript and test_milstm_fusion_cuda') -fi - - -############################################################################## -# MacOS specific flaky tests -############################################################################## - -if [[ "$(uname)" == 'Darwin' ]]; then - # TestCppExtensions by default uses a temp folder in /tmp. This doesn't - # work for this Mac machine cause there is only one machine and /tmp is - # shared. (All the linux builds are on docker so have their own /tmp). - tests_to_skip+=('TestCppExtension') -fi - -if [[ "$(uname)" == 'Darwin' && "$package_type" == 'conda' ]]; then - - # - # TestDistBackend - # Seems like either most of the Mac builds get this error or none of them - # do - # - - # Traceback (most recent call last): - # File "test_thd_distributed.py", line 1046, in wrapper - # self._join_and_reduce(fn) - # File "test_thd_distributed.py", line 1120, in _join_and_reduce - # first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE - # AssertionError - tests_to_skip+=('TestDistBackend and test_reduce_group_max') - - # Traceback (most recent call last): - # File "test_thd_distributed.py", line 1046, in wrapper - # self._join_and_reduce(fn) - # File "test_thd_distributed.py", line 1132, in _join_and_reduce - # self.assertEqual(first_process.exitcode, 0) - # File "/Users/administrator/nightlies/2018_10_01/wheel_build_dirs/conda_2.7/pytorch/test/common.py", line 397, in assertEqual - # super(TestCase, self).assertLessEqual(abs(x - y), prec, message) - # AssertionError: 1 not less than or equal to 1e-05 - tests_to_skip+=('TestDistBackend and test_isend') - tests_to_skip+=('TestDistBackend and test_reduce_group_min') - tests_to_skip+=('TestDistBackend and test_reduce_max') - tests_to_skip+=('TestDistBackend and test_reduce_min') - 
tests_to_skip+=('TestDistBackend and test_reduce_group_max') - tests_to_skip+=('TestDistBackend and test_reduce_group_min') - tests_to_skip+=('TestDistBackend and test_reduce_max') - tests_to_skip+=('TestDistBackend and test_reduce_min') - tests_to_skip+=('TestDistBackend and test_reduce_product') - tests_to_skip+=('TestDistBackend and test_reduce_sum') - tests_to_skip+=('TestDistBackend and test_scatter') - tests_to_skip+=('TestDistBackend and test_send_recv') - tests_to_skip+=('TestDistBackend and test_send_recv_any_source') -fi - - -# Turn the set of tests to skip into an invocation that pytest understands -excluded_tests_logic='' -for exclusion in "${tests_to_skip[@]}"; do - if [[ -z "$excluded_tests_logic" ]]; then - # Only true for i==0 - excluded_tests_logic="not ($exclusion)" - else - excluded_tests_logic="$excluded_tests_logic and not ($exclusion)" - fi -done - - -############################################################################## -# Run the tests -############################################################################## -echo -echo "$(date) :: Calling 'python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k '$excluded_tests_logic'" - -python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k "'" "$excluded_tests_logic" "'" - -echo -echo "$(date) :: Finished 'python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k '$excluded_tests_logic'" - -# cpp_extensions don't work with pytest, so we run them without pytest here, -# except there's a failure on CUDA builds (documented above), and -# cpp_extensions doesn't work on a shared mac machine (also documented above) -if [[ "$cuda_ver" == 'cpu' && "$(uname)" != 'Darwin' ]]; then - echo - echo "$(date) :: Calling 'python test/run_test.py -v -i cpp_extensions'" - python test/run_test.py -v -i cpp_extensions - echo - echo "$(date) :: Finished 'python test/run_test.py -v -i 
cpp_extensions'" -fi - -# thd_distributed can run on Mac but not in pytest -if [[ "$(uname)" == 'Darwin' ]]; then - echo - echo "$(date) :: Calling 'python test/run_test.py -v -i thd_distributed'" - python test/run_test.py -v -i thd_distributed - echo - echo "$(date) :: Finished 'python test/run_test.py -v -i thd_distributed'" -fi diff --git a/windows/internal/cuda_install.bat b/windows/internal/cuda_install.bat index 2bfec067a..eb0ccac68 100644 --- a/windows/internal/cuda_install.bat +++ b/windows/internal/cuda_install.bat @@ -10,9 +10,7 @@ set SRC_DIR=%~dp0\.. if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" set /a CUDA_VER=%CUDA_VERSION% -set CUDA_VER_MAJOR=%CUDA_VERSION:~0,-1% -set CUDA_VER_MINOR=%CUDA_VERSION:~-1,1% -set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR% +set CUDA_VERSION_STR=%CUDA_VERSION% set CUDNN_FOLDER="cuda" set CUDNN_LIB_FOLDER="lib\x64" @@ -228,5 +226,4 @@ goto set_cuda_env_vars echo Setting up environment... set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%" set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" -set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt"