# Training image built on a CUDA 9.0 / OpenGL Ubuntu 16.04 base.
# Build steps, in order:
#   1. compile CPython from source into /usr/local and bootstrap pip
#   2. install cuDNN / NCCL and general build tooling
#   3. install TensorFlow, Open MPI and Horovod
#   4. install OpenSSH (used by MPI between containers)
#   5. install the local ml-agents-envs and ml-agents packages
#   6. install google-cloud-sdk and copy the unity-volume directory
FROM nvidia/cudagl:9.0-devel-ubuntu16.04

# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV TENSORFLOW_VERSION=1.12.0 \
    PYTORCH_VERSION=1.1.0 \
    TORCHVISION_VERSION=0.2.2.post3 \
    CUDNN_VERSION=7.4.1.5-1+cuda9.0 \
    NCCL_VERSION=2.3.7-1+cuda9.0

# Set default shell to /bin/bash.
# -u: fail on unset variables; -o pipefail: a failing command on the left-hand
# side of a pipe (e.g. `curl ... | apt-key add -`) fails the whole RUN step.
SHELL ["/bin/bash", "-o", "pipefail", "-cu"]

# ensure local python is preferred over distribution python
ENV PATH=/usr/local/bin:$PATH

# http://bugs.python.org/issue19846
# > At the moment, setting "LANG=C" on a Linux system *fundamentally breaks Python 3*, and that's not OK.
ENV LANG=C.UTF-8

# runtime dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libexpat1 \
        libffi6 \
        libgdbm3 \
        libreadline6 \
        libsqlite3-0 \
        libssl1.0.0 \
    && rm -rf /var/lib/apt/lists/*

ENV GPG_KEY=0D96DF4D4110E5C43FBFB17F2D347EA6AA65421D
ENV PYTHON_VERSION=3.6.4

# Download, verify, build and install CPython, then purge the build
# dependencies in the same layer so they do not remain in the image.
RUN set -ex \
    && buildDeps=" \
        dpkg-dev \
        gcc \
        libbz2-dev \
        libc6-dev \
        libexpat1-dev \
        libffi-dev \
        libgdbm-dev \
        liblzma-dev \
        libncursesw5-dev \
        libreadline-dev \
        libsqlite3-dev \
        libssl-dev \
        make \
        tcl-dev \
        tk-dev \
        wget \
        xz-utils \
        zlib1g-dev \
# as of Stretch, "gpg" is no longer included by default
        $(command -v gpg > /dev/null || echo 'gnupg dirmngr') \
    " \
    && apt-get update && apt-get install -y $buildDeps --no-install-recommends && rm -rf /var/lib/apt/lists/* \
    \
    && wget -O python.tar.xz "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz" \
    && wget -O python.tar.xz.asc "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz.asc" \
    && export GNUPGHOME="$(mktemp -d)" \
# The SKS keyserver pool (ha.pool.sks-keyservers.net) has been shut down;
# fetch the release signing key from the Ubuntu keyserver over plain hkp.
    && gpg --batch --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys "$GPG_KEY" \
    && gpg --batch --verify python.tar.xz.asc python.tar.xz \
    && rm -rf "$GNUPGHOME" python.tar.xz.asc \
    && mkdir -p /usr/src/python \
    && tar -xJC /usr/src/python --strip-components=1 -f python.tar.xz \
    && rm python.tar.xz \
    \
    && cd /usr/src/python \
    && gnuArch="$(dpkg-architecture --query DEB_BUILD_GNU_TYPE)" \
    && ./configure \
        --build="$gnuArch" \
        --enable-loadable-sqlite-extensions \
        --enable-shared \
        --with-system-expat \
        --with-system-ffi \
        --without-ensurepip \
    && make -j "$(nproc)" \
    && make install \
    && ldconfig \
    \
    && apt-get purge -y --auto-remove $buildDeps \
    \
# drop bundled test suites and compiled bytecode to shrink the layer
    && find /usr/local -depth \
        \( \
            \( -type d -a \( -name test -o -name tests \) \) \
            -o \
            \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \
        \) -exec rm -rf '{}' + \
    && rm -rf /usr/src/python

# make some useful symlinks that are expected to exist
RUN ln -s idle3 /usr/local/bin/idle \
    && ln -s pydoc3 /usr/local/bin/pydoc \
    && ln -s python3 /usr/local/bin/python \
    && ln -s python3-config /usr/local/bin/python-config

# if this is called "PIP_VERSION", pip explodes with "ValueError: invalid truth value ''"
ENV PYTHON_PIP_VERSION=9.0.3

# Bootstrap pip. The generic get-pip.py no longer runs on Python 3.6, so the
# version-specific script published under /pip/3.6/ is used instead.
RUN set -ex; \
    \
    apt-get update; \
    apt-get install -y --no-install-recommends wget; \
    rm -rf /var/lib/apt/lists/*; \
    \
    wget -O get-pip.py 'https://bootstrap.pypa.io/pip/3.6/get-pip.py'; \
    \
    apt-get purge -y --auto-remove wget; \
    \
    python get-pip.py \
        --disable-pip-version-check \
        --no-cache-dir \
        "pip==$PYTHON_PIP_VERSION" \
    ; \
    pip --version; \
    \
    find /usr/local -depth \
        \( \
            \( -type d -a \( -name test -o -name tests \) \) \
            -o \
            \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \
        \) -exec rm -rf '{}' +; \
    rm -f get-pip.py

# Build tooling plus the cuDNN / NCCL versions pinned above.
# --allow-downgrades / --allow-change-held-packages let apt replace whatever
# versions of these libraries are already present with the pinned ones.
RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        g++-4.8 \
        git \
        libcudnn7=${CUDNN_VERSION} \
        libibverbs-dev \
        libibverbs1 \
        libjpeg-dev \
        libnccl-dev=${NCCL_VERSION} \
        libnccl2=${NCCL_VERSION} \
        libpng-dev \
        librdmacm1 \
        vim \
        wget \
        xvfb \
    && rm -rf /var/lib/apt/lists/*

# Install TensorFlow
RUN pip install --no-cache-dir future typing
RUN pip install --no-cache-dir \
        numpy \
        tensorflow-gpu==${TENSORFLOW_VERSION} \
        keras \
        h5py

# Install Open MPI
RUN mkdir /tmp/openmpi && \
    cd /tmp/openmpi && \
    wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.0.tar.gz && \
    tar zxf openmpi-4.0.0.tar.gz && \
    cd openmpi-4.0.0 && \
    ./configure --enable-orterun-prefix-by-default && \
    make -j $(nproc) all && \
    make install && \
    ldconfig && \
    rm -rf /tmp/openmpi

# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
    HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod && \
    ldconfig

# Install OpenSSH for MPI to communicate between containers
# (apt-get update runs in the same layer so the package index is never stale)
RUN apt-get update && \
    apt-get install -y --no-install-recommends openssh-client openssh-server && \
    rm -rf /var/lib/apt/lists/* && \
    mkdir -p /var/run/sshd

# Allow OpenSSH to talk to containers without asking for confirmation
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

# Install ml-agents-envs package locally
COPY ml-agents-envs /ml-agents-envs
WORKDIR /ml-agents-envs
RUN pip install --no-cache-dir -e .

# Install ml-agents package next
COPY ml-agents /ml-agents
WORKDIR /ml-agents
RUN pip install --no-cache-dir -e .

# setup google-cloud-sdk, which is used to copy files to gcs after the training is finished
RUN apt-get update \
    && apt-get install --yes --no-install-recommends \
        ca-certificates \
        curl \
    && echo "deb http://packages.cloud.google.com/apt cloud-sdk-xenial main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list \
# -f makes curl exit non-zero on HTTP errors instead of piping an error page to apt-key
    && curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - \
    && apt-get update \
    && apt-get install --yes google-cloud-sdk \
    && apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

COPY unity-volume /unity-volume
# mark the copied *.x86_64 files as executable
RUN chmod +x /unity-volume/*.x86_64

CMD ["/bin/bash"]