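# NOTE(review): the lines below are repository web-viewer residue, not manifest content.
# You can select at most 25 topics
# Topics must start with a Chinese character, a letter, or a digit, may contain hyphens (-), and must not exceed 35 characters
# 64 lines
# 3.0 KiB
# 64 lines
# 3.0 KiB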
---
# Kubeflow MPIJob that runs `mlagents-learn --train` through mpirun.
# One Launcher pod issues the mpirun commands; eight Worker pods are declared,
# matching `-np 8` with `slotsPerWorker: 1`.
apiVersion: kubeflow.org/v1alpha2
kind: MPIJob
metadata:
  name: mlagents-horovod-test
spec:
  slotsPerWorker: 1
  cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
            # NOTE(review): the image is referenced by the mutable ":latest"
            # tag; consider pinning a version or digest for reproducible runs.
            - image: gcr.io/unity-ml-agents-expts-test/mlagents-horovod:latest
              name: mlagents-horovod-test
              resources:
                limits:
                  cpu: 4
              command: ["/bin/sh", "-c"]
              # Single shell string passed to `/bin/sh -c`. The folded block
              # scalar joins the lines below with spaces, so the shell sees
              # three `;`-separated mpirun invocations: training, then a
              # gsutil copy of `models`, then a gsutil copy of `summaries`.
              # Because the commands are chained with `;` (not `&&`), the
              # copies run even if the training command fails.
              # NOTE(review): the gsutil copies are also launched with
              # `-np 8`, i.e. once per MPI rank - confirm this is intended.
              args:
                - >-
                  mpirun --allow-run-as-root -np 8 -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x HOROVOD_AUTOTUNE=1 -x PATH -mca pml ob1 -mca btl ^openib mlagents-learn /unity-volume/trainer_config.yaml --run-id=snoopydfd-ppo-8m --train --env=/unity-volume/SnoopyPop15Levels_dfd --num-envs=6;
                  mpirun --allow-run-as-root -np 8 -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x HOROVOD_AUTOTUNE=1 -x PATH -mca pml ob1 -mca btl ^openib gsutil cp -r models gs://ray-volume/horovod/;
                  mpirun --allow-run-as-root -np 8 -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x HOROVOD_AUTOTUNE=1 -x PATH -mca pml ob1 -mca btl ^openib gsutil cp -r summaries gs://ray-volume/horovod/;

    Worker:
      replicas: 8
      template:
        spec:
          containers:
            # Training container: no command/args are set here, so the image
            # default is used. One GPU is requested per worker.
            - image: gcr.io/unity-ml-agents-expts-test/mlagents-horovod:latest
              name: mlagents-horovod-test
              resources:
                limits:
                  cpu: 2
                  nvidia.com/gpu: 1
                  memory: 8G
            # Environment sidecar: runs the Unity executable under xvfb-run.
            - image: gcr.io/unity-ml-agents-expts-test/mlagents-horovod-env:latest
              name: mlagents-horovod-test-env
              resources:
                requests:
                  cpu: 12
                  memory: 48G
                limits:
                  cpu: 12
                  memory: 48G
              command: ["/bin/sh", "-c"]
              # Single shell string passed to `/bin/sh -c`. After a 45 s
              # sleep it starts six instances of the Unity executable on ports
              # 5005-5010 (matching `--num-envs=6` in the Launcher command).
              # The first five are backgrounded in subshells; the last one
              # runs in the foreground, so the shell blocks on it.
              # NOTE(review): the sleep presumably waits for the trainer to
              # come up - confirm the required start-up order.
              args:
                - >-
                  sleep 45s;
                  cd /unity-volume;
                  (xvfb-run --auto-servernum --server-args='-screen 0 640x480x24' /unity-volume/SnoopyPop15Levels_dfd.x86_64 --port 5005 &);
                  (xvfb-run --auto-servernum --server-args='-screen 0 640x480x24' /unity-volume/SnoopyPop15Levels_dfd.x86_64 --port 5006 &);
                  (xvfb-run --auto-servernum --server-args='-screen 0 640x480x24' /unity-volume/SnoopyPop15Levels_dfd.x86_64 --port 5007 &);
                  (xvfb-run --auto-servernum --server-args='-screen 0 640x480x24' /unity-volume/SnoopyPop15Levels_dfd.x86_64 --port 5008 &);
                  (xvfb-run --auto-servernum --server-args='-screen 0 640x480x24' /unity-volume/SnoopyPop15Levels_dfd.x86_64 --port 5009 &);
                  xvfb-run --auto-servernum --server-args='-screen 0 640x480x24' /unity-volume/SnoopyPop15Levels_dfd.x86_64 --port 5010
              stdin: true
              tty: true
              # NOTE(review): this container runs privileged AND adds
              # SYS_ADMIN. A privileged container is already granted all
              # capabilities, so the explicit add looks redundant; confirm
              # whether full privileged mode is actually required.
              securityContext:
                privileged: true
                capabilities:
                  add:
                    - SYS_ADMIN