---
# MPIJob that runs `mlagents-learn` under mpirun/Horovod.
#
# Launcher: one pod that invokes mpirun three times in sequence: training,
#           then a copy of `models`, then a copy of `summaries` to
#           gs://ray-volume/horovod/.
# Worker:   eight pods, each with a trainer container (1 GPU) and a sidecar
#           container that starts six Unity environment instances under
#           xvfb-run on ports 5005-5010.
apiVersion: kubeflow.org/v1alpha2
kind: MPIJob
metadata:
  name: mlagents-horovod-test
spec:
  # `mpirun -np 8` in the Launcher args equals Worker replicas (8) times
  # slotsPerWorker (1); keep the three values in sync when scaling.
  slotsPerWorker: 1
  cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
            # NOTE(review): `latest` is a mutable tag; consider pinning a
            # version or digest so runs are reproducible.
            - image: gcr.io/unity-ml-agents-expts-test/mlagents-horovod:latest
              name: mlagents-horovod-test
              resources:
                limits:
                  cpu: 4
              command: ["/bin/sh", "-c"]
              args:
                # Folded scalar: every line below is joined with a single
                # space into one shell command line, so all lines must stay at
                # exactly the same indentation. The commands are separated by
                # `;`, so both copy steps run even if training exits non-zero.
                - >-
                  mpirun --allow-run-as-root -np 8 -bind-to none -map-by slot
                  -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x HOROVOD_AUTOTUNE=1
                  -x PATH -mca pml ob1 -mca btl ^openib
                  mlagents-learn /unity-volume/trainer_config.yaml
                  --run-id=snoopydfd-ppo-8m --train
                  --env=/unity-volume/SnoopyPop15Levels_dfd --num-envs=6;
                  mpirun --allow-run-as-root -np 8 -bind-to none -map-by slot
                  -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x HOROVOD_AUTOTUNE=1
                  -x PATH -mca pml ob1 -mca btl ^openib
                  gsutil cp -r models gs://ray-volume/horovod/;
                  mpirun --allow-run-as-root -np 8 -bind-to none -map-by slot
                  -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x HOROVOD_AUTOTUNE=1
                  -x PATH -mca pml ob1 -mca btl ^openib
                  gsutil cp -r summaries gs://ray-volume/horovod/;
    Worker:
      replicas: 8
      template:
        spec:
          containers:
            # Trainer container: no command override here, so it runs the
            # image's default entrypoint.
            - image: gcr.io/unity-ml-agents-expts-test/mlagents-horovod:latest
              name: mlagents-horovod-test
              resources:
                limits:
                  cpu: 2
                  nvidia.com/gpu: 1
                  memory: 8G
            # Environment sidecar: hosts the Unity executables.
            - image: gcr.io/unity-ml-agents-expts-test/mlagents-horovod-env:latest
              name: mlagents-horovod-test-env
              resources:
                requests:
                  cpu: 12
                  memory: 48G
                limits:
                  cpu: 12
                  memory: 48G
              command: ["/bin/sh", "-c"]
              args:
                # Folded scalar: keep every line at the same indentation (see
                # the Launcher args). After a fixed 45 s delay, five instances
                # are started in backgrounded subshells (ports 5005-5009); the
                # sixth (port 5010) runs in the foreground, so the container
                # lives as long as that process does. Six instances presumably
                # correspond to `--num-envs=6` in the Launcher; verify.
                - >-
                  sleep 45s;
                  cd /unity-volume;
                  (xvfb-run --auto-servernum --server-args='-screen 0 640x480x24'
                  /unity-volume/SnoopyPop15Levels_dfd.x86_64 --port 5005 &);
                  (xvfb-run --auto-servernum --server-args='-screen 0 640x480x24'
                  /unity-volume/SnoopyPop15Levels_dfd.x86_64 --port 5006 &);
                  (xvfb-run --auto-servernum --server-args='-screen 0 640x480x24'
                  /unity-volume/SnoopyPop15Levels_dfd.x86_64 --port 5007 &);
                  (xvfb-run --auto-servernum --server-args='-screen 0 640x480x24'
                  /unity-volume/SnoopyPop15Levels_dfd.x86_64 --port 5008 &);
                  (xvfb-run --auto-servernum --server-args='-screen 0 640x480x24'
                  /unity-volume/SnoopyPop15Levels_dfd.x86_64 --port 5009 &);
                  xvfb-run --auto-servernum --server-args='-screen 0 640x480x24'
                  /unity-volume/SnoopyPop15Levels_dfd.x86_64 --port 5010
              stdin: true
              tty: true
              # NOTE(review): a privileged container already receives every
              # capability, so the explicit SYS_ADMIN add looks redundant.
              # Confirm whether either setting is actually required here.
              securityContext:
                privileged: true
                capabilities:
                  add:
                    - SYS_ADMIN