Compare commits
Merge into: unity-tech-cn:main
Pull from: unity-tech-cn:develop/action-slice
This merge request has changes that conflict with the target branch. Conflicting files:
/ml-agents/mlagents/trainers/settings.py
/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
/ml-agents/mlagents/trainers/trainer/trainer_factory.py
/ml-agents/mlagents/trainers/tests/mock_brain.py
/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py
/ml-agents/mlagents/trainers/buffer.py
/ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py
/ml-agents/mlagents/trainers/torch/agent_action.py
/ml-agents/mlagents/trainers/torch/utils.py
/ml-agents/mlagents/trainers/torch/networks.py
13 commits

Author | SHA1 | Comment | Date |
---|---|---|---|
Andrew Cohen | f165bfb5 | update comment | 4 years ago |
Andrew Cohen | 95f62362 | add test | 4 years ago |
Andrew Cohen | cb13a8ca | add type/docstring to slice | 4 years ago |
Andrew Cohen | 0afe5f24 | add slice function to agent action | 4 years ago |
Ervin Teng | ac0b56bb | Fix pypi issues | 4 years ago |
GitHub | 6ae8ea1e | [coma2] Add support for variable length obs in COMA2 (#5038): Make group extrinsic part of extrinsic; Fix test and init; Fix tests and bug; Add baseline loss to TensorBoard; Add support for variable len obs in COMA2; Remove weird merge artifact; Make agent action run; Fix __getitem__ replace with slice; Revert "Fix __getitem__ replace with slice" (reverts commit 87a2c9d9a9342a7d2be4e9f620d1294a5c3bf22c); Revert "Make agent action run" (reverts commit 59531f3746c58d62cf52f58a88e27a3e428e8946) | 4 years ago |
GitHub | ba2af269 | [coma2] Make group extrinsic reward part of extrinsic (#5033): Make group extrinsic part of extrinsic; Fix test and init; Fix tests and bug; Add baseline loss to TensorBoard | 4 years ago |
Andrew Cohen | 4c56e6ad | lstm runs with coma | 4 years ago |
Andrew Cohen | 81524ee8 | lstm almost runs | 4 years ago |
Andrew Cohen | 8f799687 | ignoring precommit, grabbing baseline/critic mems from buffer in trainer | 4 years ago |
Andrew Cohen | 67beef88 | finished evaluate_by_seq, does not run | 4 years ago |
Andrew Cohen | 131fa328 | inital evaluate_by_seq, does not run | 4 years ago |
Ervin Teng | fd0dd35c | Merge branch 'main' into develop-coma2-trainer | 4 years ago |
19 files changed, with 1659 insertions and 92 deletions

- 17 ml-agents/mlagents/trainers/settings.py
- 11 ml-agents/mlagents/trainers/trainer/trainer_factory.py
- 2 ml-agents/mlagents/trainers/tests/mock_brain.py
- 27 ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py
- 11 ml-agents/mlagents/trainers/tests/torch/test_agent_action.py
- 6 ml-agents/mlagents/trainers/buffer.py
- 28 ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
- 97 ml-agents/mlagents/trainers/torch/utils.py
- 33 ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py
- 221 ml-agents/mlagents/trainers/torch/networks.py
- 16 ml-agents/mlagents/trainers/torch/agent_action.py
- 26 config/ppo/PushBlockCollab.yaml
- 11 com.unity.ml-agents/Runtime/MultiAgentGroupIdCounter.cs.meta
- 271 ml-agents/mlagents/trainers/tests/torch/test_coma.py
- 0 ml-agents/mlagents/trainers/coma/__init__.py
- 308 ml-agents/mlagents/trainers/coma/trainer.py
- 655 ml-agents/mlagents/trainers/coma/optimizer_torch.py
- 11 com.unity.ml-agents/Runtime/MultiAgentGroupIdCounter.cs.meta
config/ppo/PushBlockCollab.yaml:

behaviors:
  PushBlock:
    trainer_type: coma
    hyperparameters:
      batch_size: 1024
      buffer_size: 10240
      learning_rate: 0.0003
      beta: 0.01
      epsilon: 0.2
      lambd: 0.95
      num_epoch: 3
      learning_rate_schedule: constant
    network_settings:
      normalize: false
      hidden_units: 256
      num_layers: 2
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.99
        strength: 1.0
    keep_checkpoints: 5
    max_steps: 20000000
    time_horizon: 64
    summary_freq: 60000
    threaded: true
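To see the shape of this config at a glance, here is a minimal sketch that loads it and reads a couple of fields. PyYAML is an assumption of the sketch, not a dependency introduced by this PR; the path is the one listed in the changed files above.

# Minimal sketch: load the config above and inspect the COMA behavior settings.
import yaml

with open("config/ppo/PushBlockCollab.yaml") as f:
    run_config = yaml.safe_load(f)

pushblock = run_config["behaviors"]["PushBlock"]
print(pushblock["trainer_type"])                   # coma
print(pushblock["hyperparameters"]["batch_size"])  # 1024
print(pushblock["reward_signals"]["extrinsic"])    # {'gamma': 0.99, 'strength': 1.0}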
com.unity.ml-agents/Runtime/MultiAgentGroupIdCounter.cs.meta:

fileFormatVersion: 2
guid: 5661ffdb6c7704e84bc785572dcd5bd1
MonoImporter:
  externalObjects: {}
  serializedVersion: 2
  defaultReferences: []
  executionOrder: 0
  icon: {instanceID: 0}
  userData:
  assetBundleName:
  assetBundleVariant:
ml-agents/mlagents/trainers/tests/torch/test_coma.py:

import pytest

import numpy as np
import attr

from mlagents.trainers.coma.optimizer_torch import TorchCOMAOptimizer
from mlagents.trainers.settings import ExtrinsicSettings, RewardSignalType

from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import copy_buffer_fields
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.dummy_config import (  # noqa: F401
    ppo_dummy_config,
    curiosity_dummy_config,
    gail_dummy_config,
)

from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil


@pytest.fixture
def dummy_config():
    # coma has the same hyperparameters as ppo for now
    return ppo_dummy_config()


VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 64
NUM_AGENTS = 4

CONTINUOUS_ACTION_SPEC = ActionSpec.create_continuous(VECTOR_ACTION_SPACE)
DISCRETE_ACTION_SPEC = ActionSpec.create_discrete(tuple(DISCRETE_ACTION_SPACE))


def create_test_coma_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.reward_signals = {
        RewardSignalType.EXTRINSIC: ExtrinsicSettings(
            strength=1.0, gamma=0.99, add_groupmate_rewards=True
        )
    }

    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
    optimizer = TorchCOMAOptimizer(policy, trainer_settings)
    return optimizer


@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_coma_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    optimizer = create_test_coma_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
        num_other_agents_in_group=NUM_AGENTS,
    )
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        BufferKey.ENVIRONMENT_REWARDS,
        [
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("group"),
            RewardSignalUtil.value_estimates_key("group"),
            RewardSignalUtil.baseline_estimates_key("group"),
        ],
    )
    # Copy memories to critic memories
    copy_buffer_fields(update_buffer, BufferKey.MEMORY, [BufferKey.CRITIC_MEMORY])

    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    required_stats = [
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Policy/Learning Rate",
        "Policy/Epsilon",
        "Policy/Beta",
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()


@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [False], ids=["no_rnn"])
def test_coma_get_value_estimates(dummy_config, rnn, visual, discrete):
    optimizer = create_test_coma_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        observation_specs=optimizer.policy.behavior_spec.observation_specs,
        action_spec=DISCRETE_ACTION_SPEC if discrete else CONTINUOUS_ACTION_SPEC,
        max_step_complete=True,
        num_other_agents_in_group=NUM_AGENTS,
    )
    (
        value_estimates,
        baseline_estimates,
        next_value_estimates,
    ) = optimizer.get_trajectory_and_baseline_value_estimates(
        trajectory.to_agentbuffer(),
        trajectory.next_obs,
        trajectory.next_group_obs,
        done=False,
    )
    for key, val in value_estimates.items():
        assert type(key) is str
        assert len(val) == 15
    for key, val in baseline_estimates.items():
        assert type(key) is str
        assert len(val) == 15

    # if all_memories is not None:
    #     assert len(all_memories) == 15

    (
        value_estimates,
        baseline_estimates,
        next_value_estimates,
    ) = optimizer.get_trajectory_and_baseline_value_estimates(
        trajectory.to_agentbuffer(),
        trajectory.next_obs,
        trajectory.next_group_obs,
        done=True,
    )
    for key, val in next_value_estimates.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    optimizer.reward_signals["group"].use_terminal_states = False
    (
        value_estimates,
        baseline_estimates,
        next_value_estimates,
    ) = optimizer.get_trajectory_and_baseline_value_estimates(
        trajectory.to_agentbuffer(),
        trajectory.next_obs,
        trajectory.next_group_obs,
        done=False,
    )
    for key, val in next_value_estimates.items():
        assert type(key) is str
        assert val != 0.0


@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
# We need to test this separately from test_reward_signals.py to ensure no interactions
def test_ppo_optimizer_update_curiosity(
    dummy_config, curiosity_dummy_config, rnn, visual, discrete  # noqa: F811
):
    # Test evaluate
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_test_coma_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("curiosity"),
            RewardSignalUtil.value_estimates_key("curiosity"),
        ],
    )
    # Copy memories to critic memories
    copy_buffer_fields(update_buffer, BufferKey.MEMORY, [BufferKey.CRITIC_MEMORY])

    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )


# We need to test this separately from test_reward_signals.py to ensure no interactions
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    # Test evaluate
    dummy_config.reward_signals = gail_dummy_config
    config = ppo_dummy_config()
    optimizer = create_test_coma_optimizer(
        config, use_rnn=False, use_discrete=False, use_visual=False
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("gail"),
            RewardSignalUtil.value_estimates_key("gail"),
        ],
    )

    update_buffer[BufferKey.CONTINUOUS_LOG_PROBS] = np.ones_like(
        update_buffer[BufferKey.CONTINUOUS_ACTION]
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("gail"),
            RewardSignalUtil.value_estimates_key("gail"),
        ],
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )


if __name__ == "__main__":
    pytest.main()
ml-agents/mlagents/trainers/coma/trainer.py:

# # Unity ML-Agents Toolkit
# ## ML-Agent Learning (PPO)
# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347

from collections import defaultdict
from typing import cast, Dict

import numpy as np

from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents_envs.logging_util import get_logger
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.optimizer import Optimizer
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.coma.optimizer_torch import TorchCOMAOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, PPOSettings

logger = get_logger(__name__)


class COMATrainer(RLTrainer):
    """The COMATrainer is an implementation of the COMA2 algorithm."""

    def __init__(
        self,
        behavior_name: str,
        reward_buff_cap: int,
        trainer_settings: TrainerSettings,
        training: bool,
        load: bool,
        seed: int,
        artifact_path: str,
    ):
        """
        Responsible for collecting experiences and training PPO model.
        :param behavior_name: The name of the behavior associated with trainer config
        :param reward_buff_cap: Max reward history to track in the reward buffer
        :param trainer_settings: The parameters for the trainer.
        :param training: Whether the trainer is set for training.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with
        :param artifact_path: The directory within which to store artifacts from this trainer.
        """
        super().__init__(
            behavior_name,
            trainer_settings,
            training,
            load,
            artifact_path,
            reward_buff_cap,
        )
        self.hyperparameters: PPOSettings = cast(
            PPOSettings, self.trainer_settings.hyperparameters
        )
        self.seed = seed
        self.policy: Policy = None  # type: ignore
        self.collected_group_rewards: Dict[str, int] = defaultdict(lambda: 0)

    def _process_trajectory(self, trajectory: Trajectory) -> None:
        """
        Takes a trajectory and processes it, putting it into the update buffer.
        Processing involves calculating value and advantage targets for model updating step.
        :param trajectory: The Trajectory tuple containing the steps to be processed.
        """
        super()._process_trajectory(trajectory)
        agent_id = trajectory.agent_id  # All the agents should have the same ID

        agent_buffer_trajectory = trajectory.to_agentbuffer()
        # Update the normalization
        if self.is_training:
            self.policy.update_normalization(agent_buffer_trajectory)

        # Get all value estimates
        (
            value_estimates,
            baseline_estimates,
            value_next,
            value_memories,
            baseline_memories,
        ) = self.optimizer.get_trajectory_and_baseline_value_estimates(
            agent_buffer_trajectory,
            trajectory.next_obs,
            trajectory.next_group_obs,
            trajectory.teammate_dones_reached
            and trajectory.done_reached
            and not trajectory.interrupted,
        )

        if value_memories is not None and baseline_memories is not None:
            agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)
            agent_buffer_trajectory[BufferKey.BASELINE_MEMORY].set(baseline_memories)

        for name, v in value_estimates.items():
            agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend(
                v
            )
            agent_buffer_trajectory[
                RewardSignalUtil.baseline_estimates_key(name)
            ].extend(baseline_estimates[name])
            self._stats_reporter.add_stat(
                f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate",
                np.mean(baseline_estimates[name]),
            )
            self._stats_reporter.add_stat(
                f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
                np.mean(value_estimates[name]),
            )

        self.collected_rewards["environment"][agent_id] += np.sum(
            agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
        )
        self.collected_group_rewards[agent_id] += np.sum(
            agent_buffer_trajectory[BufferKey.GROUP_REWARD]
        )
        for name, reward_signal in self.optimizer.reward_signals.items():
            evaluate_result = (
                reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
            )
            agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend(
                evaluate_result
            )
            # Report the reward signals
            self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

        # Compute lambda returns and advantage
        tmp_advantages = []
        for name in self.optimizer.reward_signals:

            local_rewards = np.array(
                agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].get_batch(),
                dtype=np.float32,
            )

            baseline_estimate = agent_buffer_trajectory[
                RewardSignalUtil.baseline_estimates_key(name)
            ].get_batch()
            v_estimates = agent_buffer_trajectory[
                RewardSignalUtil.value_estimates_key(name)
            ].get_batch()

            lambd_returns = lambda_return(
                r=local_rewards,
                value_estimates=v_estimates,
                gamma=self.optimizer.reward_signals[name].gamma,
                lambd=self.hyperparameters.lambd,
                value_next=value_next[name],
            )

            local_advantage = np.array(lambd_returns) - np.array(baseline_estimate)

            agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set(
                lambd_returns
            )
            agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set(
                local_advantage
            )
            tmp_advantages.append(local_advantage)

        # Get global advantages
        global_advantages = list(
            np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
        )
        agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)

        # Append to update buffer
        agent_buffer_trajectory.resequence_and_append(
            self.update_buffer, training_length=self.policy.sequence_length
        )

        # If this was a terminal trajectory, append stats and reset reward collection
        if trajectory.done_reached:
            self._update_end_episode_stats(agent_id, self.optimizer)

    def _is_ready_update(self):
        """
        Returns whether or not the trainer has enough elements to run update model
        :return: A boolean corresponding to whether or not update_model() can be run
        """
        size_of_buffer = self.update_buffer.num_experiences
        return size_of_buffer > self.hyperparameters.buffer_size

    def _update_policy(self):
        """
        Uses demonstration_buffer to update the policy.
        The reward signal generators must be updated in this method at their own pace.
        """
        buffer_length = self.update_buffer.num_experiences
        self.cumulative_returns_since_policy_update.clear()

        # Make sure batch_size is a multiple of sequence length. During training, we
        # will need to reshape the data into a batch_size x sequence_length tensor.
        batch_size = (
            self.hyperparameters.batch_size
            - self.hyperparameters.batch_size % self.policy.sequence_length
        )
        # Make sure there is at least one sequence
        batch_size = max(batch_size, self.policy.sequence_length)

        n_sequences = max(
            int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
        )

        advantages = np.array(
            self.update_buffer[BufferKey.ADVANTAGES].get_batch(), dtype=np.float32
        )
        self.update_buffer[BufferKey.ADVANTAGES].set(
            (advantages - advantages.mean()) / (advantages.std() + 1e-10)
        )
        num_epoch = self.hyperparameters.num_epoch
        batch_update_stats = defaultdict(list)
        for _ in range(num_epoch):
            self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
            buffer = self.update_buffer
            max_num_batch = buffer_length // batch_size
            for i in range(0, max_num_batch * batch_size, batch_size):
                update_stats = self.optimizer.update(
                    buffer.make_mini_batch(i, i + batch_size), n_sequences
                )
                for stat_name, value in update_stats.items():
                    batch_update_stats[stat_name].append(value)

        for stat, stat_list in batch_update_stats.items():
            self._stats_reporter.add_stat(stat, np.mean(stat_list))

        if self.optimizer.bc_module:
            update_stats = self.optimizer.bc_module.update()
            for stat, val in update_stats.items():
                self._stats_reporter.add_stat(stat, val)
        self._clear_update_buffer()
        return True

    def create_torch_policy(
        self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
    ) -> TorchPolicy:
        """
        Creates a policy with a PyTorch backend and PPO hyperparameters
        :param parsed_behavior_id:
        :param behavior_spec: specifications for policy construction
        :return policy
        """
        policy = TorchPolicy(
            self.seed,
            behavior_spec,
            self.trainer_settings,
            condition_sigma_on_obs=False,  # Faster training for PPO
            separate_critic=True,  # Match network architecture with TF
        )
        return policy

    def create_coma_optimizer(self) -> TorchCOMAOptimizer:
        return TorchCOMAOptimizer(  # type: ignore
            cast(TorchPolicy, self.policy), self.trainer_settings  # type: ignore
        )  # type: ignore

    def add_policy(
        self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
    ) -> None:
        """
        Adds policy to trainer.
        :param parsed_behavior_id: Behavior identifiers that the policy should belong to.
        :param policy: Policy to associate with name_behavior_id.
        """
        self.policy = policy
        self.policies[parsed_behavior_id.behavior_id] = policy
        self.optimizer = self.create_coma_optimizer()
        for _reward_signal in self.optimizer.reward_signals.keys():
            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)

        self.model_saver.register(self.policy)
        self.model_saver.register(self.optimizer)
        self.model_saver.initialize_or_load()

        # Needed to resume loads properly
        self.step = policy.get_current_step()

    def get_policy(self, name_behavior_id: str) -> Policy:
        """
        Gets policy from trainer associated with name_behavior_id
        :param name_behavior_id: full identifier of policy
        """

        return self.policy

    def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None:
        super()._update_end_episode_stats(agent_id, optimizer)
        self.stats_reporter.add_stat(
            "Environment/Team Cumulative Reward",
            self.collected_group_rewards.get(agent_id, 0),
            aggregation=StatsAggregationMethod.HISTOGRAM,
        )
        self.collected_group_rewards.pop(agent_id)


def lambda_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
    returns = np.zeros_like(r)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(0, r.size - 1)):
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + r[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns
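To make the recursion in lambda_return concrete, here is a small standalone check with toy numbers (plain NumPy; the function body is copied from the helper above, and the inputs are chosen here purely for illustration):

import numpy as np

def lambda_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
    # Same recursion as the helper in trainer.py above.
    returns = np.zeros_like(r)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(0, r.size - 1)):
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + r[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns

# Three steps, reward only at the end, bootstrap value of 1.0 past the horizon.
r = np.array([0.0, 0.0, 1.0], dtype=np.float32)
v = np.array([0.5, 0.6, 0.7], dtype=np.float32)
out = lambda_return(r, v, gamma=0.9, lambd=0.95, value_next=1.0)
# returns[2] = 1.0 + 0.9 * 1.0 = 1.9
# returns[1] = 0.9*0.95*1.9 + 0.0 + 0.05*0.9*0.7 = 1.656
# returns[0] = 0.9*0.95*1.656 + 0.0 + 0.05*0.9*0.6 ≈ 1.4429
print(out)

The per-signal advantage used by the trainer is then this lambda return minus the counterfactual baseline estimate for the same step.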
|||
from typing import Dict, cast, List, Tuple, Optional |
|||
import numpy as np |
|||
import math |
|||
from mlagents.torch_utils import torch |
|||
|
|||
from mlagents.trainers.buffer import ( |
|||
AgentBuffer, |
|||
BufferKey, |
|||
RewardSignalUtil, |
|||
AgentBufferField, |
|||
) |
|||
|
|||
from mlagents_envs.timers import timed |
|||
from mlagents_envs.base_env import ObservationSpec, ActionSpec |
|||
from mlagents.trainers.policy.torch_policy import TorchPolicy |
|||
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer |
|||
from mlagents.trainers.settings import ( |
|||
ExtrinsicSettings, |
|||
RewardSignalSettings, |
|||
RewardSignalType, |
|||
TrainerSettings, |
|||
PPOSettings, |
|||
) |
|||
from mlagents.trainers.torch.networks import Critic, MultiInputNetworkBody |
|||
from mlagents.trainers.torch.decoders import ValueHeads |
|||
from mlagents.trainers.torch.agent_action import AgentAction |
|||
from mlagents.trainers.torch.action_log_probs import ActionLogProbs |
|||
from mlagents.trainers.torch.utils import ModelUtils |
|||
from mlagents.trainers.trajectory import ObsUtil, GroupObsUtil |
|||
from mlagents.trainers.settings import NetworkSettings |
|||
|
|||
from mlagents_envs.logging_util import get_logger |
|||
|
|||
logger = get_logger(__name__) |
|||
|
|||
|
|||
class TorchCOMAOptimizer(TorchOptimizer): |
|||
class COMAValueNetwork(torch.nn.Module, Critic): |
|||
def __init__( |
|||
self, |
|||
stream_names: List[str], |
|||
observation_specs: List[ObservationSpec], |
|||
network_settings: NetworkSettings, |
|||
action_spec: ActionSpec, |
|||
): |
|||
torch.nn.Module.__init__(self) |
|||
self.network_body = MultiInputNetworkBody( |
|||
observation_specs, network_settings, action_spec |
|||
) |
|||
if network_settings.memory is not None: |
|||
encoding_size = network_settings.memory.memory_size // 2 |
|||
else: |
|||
encoding_size = network_settings.hidden_units |
|||
|
|||
self.value_heads = ValueHeads(stream_names, encoding_size, 1) |
|||
|
|||
@property |
|||
def memory_size(self) -> int: |
|||
return self.network_body.memory_size |
|||
|
|||
def update_normalization(self, buffer: AgentBuffer) -> None: |
|||
self.network_body.update_normalization(buffer) |
|||
|
|||
def baseline( |
|||
self, |
|||
self_obs: List[List[torch.Tensor]], |
|||
obs: List[List[torch.Tensor]], |
|||
actions: List[AgentAction], |
|||
memories: Optional[torch.Tensor] = None, |
|||
sequence_length: int = 1, |
|||
) -> Tuple[torch.Tensor, torch.Tensor]: |
|||
|
|||
encoding, memories = self.network_body( |
|||
obs_only=self_obs, |
|||
obs=obs, |
|||
actions=actions, |
|||
memories=memories, |
|||
sequence_length=sequence_length, |
|||
) |
|||
value_outputs, critic_mem_out = self.forward( |
|||
encoding, memories, sequence_length |
|||
) |
|||
return value_outputs, critic_mem_out |
|||
|
|||
def critic_pass( |
|||
self, |
|||
obs: List[List[torch.Tensor]], |
|||
memories: Optional[torch.Tensor] = None, |
|||
sequence_length: int = 1, |
|||
) -> Tuple[torch.Tensor, torch.Tensor]: |
|||
|
|||
encoding, memories = self.network_body( |
|||
obs_only=obs, |
|||
obs=[], |
|||
actions=[], |
|||
memories=memories, |
|||
sequence_length=sequence_length, |
|||
) |
|||
value_outputs, critic_mem_out = self.forward( |
|||
encoding, memories, sequence_length |
|||
) |
|||
return value_outputs, critic_mem_out |
|||
|
|||
def forward( |
|||
self, |
|||
encoding: torch.Tensor, |
|||
memories: Optional[torch.Tensor] = None, |
|||
sequence_length: int = 1, |
|||
) -> Tuple[torch.Tensor, torch.Tensor]: |
|||
|
|||
output = self.value_heads(encoding) |
|||
return output, memories |
|||
|
|||
def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings): |
|||
""" |
|||
Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy. |
|||
The PPO optimizer has a value estimator and a loss function. |
|||
:param policy: A TorchPolicy object that will be updated by this PPO Optimizer. |
|||
:param trainer_params: Trainer parameters dictionary that specifies the |
|||
properties of the trainer. |
|||
""" |
|||
# Create the graph here to give more granular control of the TF graph to the Optimizer. |
|||
|
|||
super().__init__(policy, trainer_settings) |
|||
reward_signal_configs = trainer_settings.reward_signals |
|||
reward_signal_names = [key.value for key, _ in reward_signal_configs.items()] |
|||
|
|||
self._critic = TorchCOMAOptimizer.COMAValueNetwork( |
|||
reward_signal_names, |
|||
policy.behavior_spec.observation_specs, |
|||
network_settings=trainer_settings.network_settings, |
|||
action_spec=policy.behavior_spec.action_spec, |
|||
) |
|||
|
|||
params = list(self.policy.actor.parameters()) + list(self.critic.parameters()) |
|||
self.hyperparameters: PPOSettings = cast( |
|||
PPOSettings, trainer_settings.hyperparameters |
|||
) |
|||
self.decay_learning_rate = ModelUtils.DecayedValue( |
|||
self.hyperparameters.learning_rate_schedule, |
|||
self.hyperparameters.learning_rate, |
|||
1e-10, |
|||
self.trainer_settings.max_steps, |
|||
) |
|||
self.decay_epsilon = ModelUtils.DecayedValue( |
|||
self.hyperparameters.learning_rate_schedule, |
|||
self.hyperparameters.epsilon, |
|||
0.1, |
|||
self.trainer_settings.max_steps, |
|||
) |
|||
self.decay_beta = ModelUtils.DecayedValue( |
|||
self.hyperparameters.learning_rate_schedule, |
|||
self.hyperparameters.beta, |
|||
1e-5, |
|||
self.trainer_settings.max_steps, |
|||
) |
|||
|
|||
self.optimizer = torch.optim.Adam( |
|||
params, lr=self.trainer_settings.hyperparameters.learning_rate |
|||
) |
|||
self.stats_name_to_update_name = { |
|||
"Losses/Value Loss": "value_loss", |
|||
"Losses/Policy Loss": "policy_loss", |
|||
} |
|||
|
|||
self.stream_names = list(self.reward_signals.keys()) |
|||
self.value_memory_dict: Dict[str, torch.Tensor] = {} |
|||
self.baseline_memory_dict: Dict[str, torch.Tensor] = {} |
|||
|
|||
    def create_reward_signals(
        self, reward_signal_configs: Dict[RewardSignalType, RewardSignalSettings]
    ) -> None:
        """
        Create reward signals. Override the default to warn about Curiosity and
        GAIL, and to make sure the Extrinsic signal adds groupmate rewards.
        :param reward_signal_configs: Reward signal config.
        """
        for reward_signal, settings in reward_signal_configs.items():
            if reward_signal != RewardSignalType.EXTRINSIC:
                logger.warning(
                    f"Reward Signal {reward_signal.value} is not supported with the "
                    "COMA2 trainer; results may be unexpected."
                )
            elif isinstance(settings, ExtrinsicSettings):
                settings.add_groupmate_rewards = True
        super().create_reward_signals(reward_signal_configs)

    @property
    def critic(self):
        return self._critic

    def coma_value_loss(
        self,
        values: Dict[str, torch.Tensor],
        old_values: Dict[str, torch.Tensor],
        returns: Dict[str, torch.Tensor],
        epsilon: float,
        loss_masks: torch.Tensor,
    ) -> torch.Tensor:
        """
        Evaluates the clipped value loss used by PPO.
        :param values: Value output of the current network.
        :param old_values: Value stored with experiences in the buffer.
        :param returns: Computed returns.
        :param epsilon: Clipping value for the value estimate.
        :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
        """
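        # Clipped value loss, one term per reward stream:
        #   L_V = mean_t[ max( (R_t - V_t)^2, (R_t - clip(V_t, V_old_t - eps, V_old_t + eps))^2 ) ]
        # The max keeps the update pessimistic when the new estimate moves far from
        # the stored one, mirroring PPO's clipped surrogate for the policy.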
        value_losses = []
        for name, head in values.items():
            old_val_tensor = old_values[name]
            returns_tensor = returns[name]
            clipped_value_estimate = old_val_tensor + torch.clamp(
                head - old_val_tensor, -1 * epsilon, epsilon
            )
            v_opt_a = (returns_tensor - head) ** 2
            v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
            value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
            value_losses.append(value_loss)
        value_loss = torch.mean(torch.stack(value_losses))
        return value_loss

    def ppo_policy_loss(
        self,
        advantages: torch.Tensor,
        log_probs: torch.Tensor,
        old_log_probs: torch.Tensor,
        loss_masks: torch.Tensor,
    ) -> torch.Tensor:
        """
        Evaluate the PPO policy loss.
        :param advantages: Computed advantages.
        :param log_probs: Current policy log probabilities.
        :param old_log_probs: Past policy log probabilities.
        :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
        """
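        # PPO clipped surrogate objective (maximized, so the loss is its negative):
        #   r_t = exp(log pi(a_t|s_t) - log pi_old(a_t|s_t))
        #   L_pi = -mean_t[ min( r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t ) ]
        # Note that eps here is the un-decayed epsilon hyperparameter.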
        advantage = advantages.unsqueeze(-1)

        decay_epsilon = self.hyperparameters.epsilon
        r_theta = torch.exp(log_probs - old_log_probs)
        p_opt_a = r_theta * advantage
        p_opt_b = (
            torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage
        )
        policy_loss = -1 * ModelUtils.masked_mean(
            torch.min(p_opt_a, p_opt_b), loss_masks
        )
        return policy_loss

    @timed
    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
        """
        Performs an update on the model.
        :param batch: Batch of experiences.
        :param num_sequences: Number of sequences to process.
        :return: Results of the update.
        """
        # Get decayed parameters
        decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
        decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
        decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
        returns = {}
        old_values = {}
        old_baseline_values = {}
        for name in self.reward_signals:
            old_values[name] = ModelUtils.list_to_tensor(
                batch[RewardSignalUtil.value_estimates_key(name)]
            )
            returns[name] = ModelUtils.list_to_tensor(
                batch[RewardSignalUtil.returns_key(name)]
            )
            old_baseline_values[name] = ModelUtils.list_to_tensor(
                batch[RewardSignalUtil.baseline_estimates_key(name)]
            )

        n_obs = len(self.policy.behavior_spec.observation_specs)
        current_obs = ObsUtil.from_buffer(batch, n_obs)
        # Convert to tensors
        current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
        group_obs = GroupObsUtil.from_buffer(batch, n_obs)
        group_obs = [
            [ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs]
            for _groupmate_obs in group_obs
        ]

        act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
        actions = AgentAction.from_buffer(batch)
        group_actions = AgentAction.group_from_buffer(batch)

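        # The buffer stores one memory entry per step; keep one entry per sequence so
        # each sequence is evaluated from the memory recorded at its first step.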
        memories = [
            ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
            for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
        ]
        if len(memories) > 0:
            memories = torch.stack(memories).unsqueeze(0)
        value_memories = [
            ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
            for i in range(
                0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
            )
        ]

        baseline_memories = [
            ModelUtils.list_to_tensor(batch[BufferKey.BASELINE_MEMORY][i])
            for i in range(
                0, len(batch[BufferKey.BASELINE_MEMORY]), self.policy.sequence_length
            )
        ]

        if len(value_memories) > 0:
            value_memories = torch.stack(value_memories).unsqueeze(0)
            baseline_memories = torch.stack(baseline_memories).unsqueeze(0)

        log_probs, entropy = self.policy.evaluate_actions(
            current_obs,
            masks=act_masks,
            actions=actions,
            memories=memories,
            seq_len=self.policy.sequence_length,
        )
        all_obs = [current_obs] + group_obs
        values, _ = self.critic.critic_pass(
            all_obs,
            memories=value_memories,
            sequence_length=self.policy.sequence_length,
        )
        baselines, _ = self.critic.baseline(
            [current_obs],
            group_obs,
            group_actions,
            memories=baseline_memories,
            sequence_length=self.policy.sequence_length,
        )
        old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
        log_probs = log_probs.flatten()
        loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)

        baseline_loss = self.coma_value_loss(
            baselines, old_baseline_values, returns, decay_eps, loss_masks
        )
        value_loss = self.coma_value_loss(
            values, old_values, returns, decay_eps, loss_masks
        )
        policy_loss = self.ppo_policy_loss(
            ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]),
            log_probs,
            old_log_probs,
            loss_masks,
        )
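        # Total loss: the value and baseline losses are down-weighted (0.5 and 0.25
        # respectively) relative to the policy loss, and the decayed beta scales the
        # entropy bonus that discourages the policy from becoming deterministic too early.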
        loss = (
            policy_loss
            + 0.5 * (value_loss + 0.5 * baseline_loss)
            - decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
        )

        # Set optimizer learning rate
        ModelUtils.update_learning_rate(self.optimizer, decay_lr)
        self.optimizer.zero_grad()
        loss.backward()

        self.optimizer.step()
        update_stats = {
            # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
            # TODO: After PyTorch is default, change to something more correct.
            "Losses/Policy Loss": torch.abs(policy_loss).item(),
            "Losses/Value Loss": value_loss.item(),
            "Losses/Baseline Loss": baseline_loss.item(),
            "Policy/Learning Rate": decay_lr,
            "Policy/Epsilon": decay_eps,
            "Policy/Beta": decay_bet,
        }

        for reward_provider in self.reward_signals.values():
            update_stats.update(reward_provider.update(batch))

        return update_stats

    def get_modules(self):
        modules = {"Optimizer": self.optimizer}
        for reward_provider in self.reward_signals.values():
            modules.update(reward_provider.get_modules())
        return modules

    def _evaluate_by_sequence_team(
        self,
        self_obs: List[torch.Tensor],
        obs: List[List[torch.Tensor]],
        actions: List[AgentAction],
        init_value_mem: torch.Tensor,
        init_baseline_mem: torch.Tensor,
    ) -> Tuple[
        Dict[str, torch.Tensor],
        Dict[str, torch.Tensor],
        AgentBufferField,
        AgentBufferField,
        torch.Tensor,
        torch.Tensor,
    ]:
        """
        Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the
        intermediate memories for the critic.
        :param self_obs: A List of tensors of shape (trajectory_len, <obs_dim>) that are the agent's
            observations for this trajectory.
        :param obs: The observations of the agent's groupmates for this trajectory, one list per groupmate.
        :param actions: The actions of the agent's groupmates for this trajectory, one AgentAction per groupmate.
        :param init_value_mem: The critic memory that precedes this trajectory. Of shape (1,1,<mem_size>), i.e.
            what is returned as the output of a MemoryModules.
        :param init_baseline_mem: The baseline memory that precedes this trajectory, of the same shape.
        :return: A Tuple of the value and baseline estimates as Dicts of [name, tensor], the AgentBufferFields
            of the critic and baseline memories to be used during the value function update, and the final
            critic and baseline memories at the end of the trajectory.
        """
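        # Approach: evaluate the critic and the baseline one sequence at a time,
        # threading the recurrent memory from each sequence into the next and recording
        # the memory used at every step so the update can later replay the same sequences.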
        num_experiences = self_obs[0].shape[0]
        all_next_value_mem = AgentBufferField()
        all_next_baseline_mem = AgentBufferField()
        # In the buffer, the 1st sequence are the ones that are padded. So if seq_len = 3 and
        # trajectory is of length 10, the 1st sequence is [pad, pad, obs].
        # Compute the number of elements in this padded seq.
        leftover = num_experiences % self.policy.sequence_length

        # Compute values for the potentially truncated initial sequence
        first_seq_len = leftover if leftover > 0 else self.policy.sequence_length

        self_seq_obs = []
        team_seq_obs = []
        team_seq_act = []
        seq_obs = []
        for _self_obs in self_obs:
            first_seq_obs = _self_obs[0:first_seq_len]
            seq_obs.append(first_seq_obs)
        self_seq_obs.append(seq_obs)

        for team_obs, team_action in zip(obs, actions):
            seq_obs = []
            for _obs in team_obs:
                first_seq_obs = _obs[0:first_seq_len]
                seq_obs.append(first_seq_obs)
            team_seq_obs.append(seq_obs)
            _act = team_action.slice(0, first_seq_len)
            team_seq_act.append(_act)

        # For the first sequence, the initial memory should be the one at the
        # beginning of this trajectory.
        for _ in range(first_seq_len):
            all_next_value_mem.append(ModelUtils.to_numpy(init_value_mem.squeeze()))
            all_next_baseline_mem.append(
                ModelUtils.to_numpy(init_baseline_mem.squeeze())
            )

        all_seq_obs = self_seq_obs + team_seq_obs
        init_values, _value_mem = self.critic.critic_pass(
            all_seq_obs, init_value_mem, sequence_length=first_seq_len
        )
        all_values = {
            signal_name: [init_values[signal_name]]
            for signal_name in init_values.keys()
        }

        init_baseline, _baseline_mem = self.critic.baseline(
            self_seq_obs,
            team_seq_obs,
            team_seq_act,
            init_baseline_mem,
            sequence_length=first_seq_len,
        )
        all_baseline = {
            signal_name: [init_baseline[signal_name]]
            for signal_name in init_baseline.keys()
        }

        # Evaluate the remaining sequences, carrying over _value_mem and
        # _baseline_mem after each one
        for seq_num in range(
            1, math.ceil(num_experiences / self.policy.sequence_length)
        ):
            for _ in range(self.policy.sequence_length):
                all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze()))
                all_next_baseline_mem.append(
                    ModelUtils.to_numpy(_baseline_mem.squeeze())
                )

            start = seq_num * self.policy.sequence_length - (
                self.policy.sequence_length - leftover
            )
            end = (seq_num + 1) * self.policy.sequence_length - (
                self.policy.sequence_length - leftover
            )

            self_seq_obs = []
            team_seq_obs = []
            team_seq_act = []
            seq_obs = []
            for _self_obs in self_obs:
                seq_obs.append(_self_obs[start:end])
            self_seq_obs.append(seq_obs)

            for team_obs, team_action in zip(obs, actions):
                seq_obs = []
                for _obs in team_obs:
                    seq_obs.append(_obs[start:end])
                team_seq_obs.append(seq_obs)
                _act = team_action.slice(start, end)
                team_seq_act.append(_act)

            all_seq_obs = self_seq_obs + team_seq_obs
            values, _value_mem = self.critic.critic_pass(
                all_seq_obs, _value_mem, sequence_length=self.policy.sequence_length
            )
            for signal_name in values.keys():
                all_values[signal_name].append(values[signal_name])

            baselines, _baseline_mem = self.critic.baseline(
                self_seq_obs,
                team_seq_obs,
                team_seq_act,
                _baseline_mem,
                sequence_length=self.policy.sequence_length,
            )
            for signal_name in baselines.keys():
                all_baseline[signal_name].append(baselines[signal_name])
        # Create one tensor per reward signal
        all_value_tensors = {
            signal_name: torch.cat(value_list, dim=0)
            for signal_name, value_list in all_values.items()
        }
        all_baseline_tensors = {
            signal_name: torch.cat(baseline_list, dim=0)
            for signal_name, baseline_list in all_baseline.items()
        }
        next_value_mem = _value_mem
        next_baseline_mem = _baseline_mem
        return (
            all_value_tensors,
            all_baseline_tensors,
            all_next_value_mem,
            all_next_baseline_mem,
            next_value_mem,
            next_baseline_mem,
        )

    def get_trajectory_and_baseline_value_estimates(
        self,
        batch: AgentBuffer,
        next_obs: List[np.ndarray],
        next_group_obs: List[List[np.ndarray]],
        done: bool,
        agent_id: str = "",
    ) -> Tuple[
        Dict[str, np.ndarray],
        Dict[str, np.ndarray],
        Dict[str, float],
        Optional[AgentBufferField],
        Optional[AgentBufferField],
    ]:
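        """
        Get value and baseline estimates for a trajectory, plus the value of the state
        following the trajectory, for bootstrapping returns.
        :param batch: An AgentBuffer that contains the trajectory.
        :param next_obs: The agent's observation following the trajectory.
        :param next_group_obs: The groupmates' observations following the trajectory.
        :param done: Whether the trajectory ended with a terminal state.
        :param agent_id: Used to cache the critic and baseline memories per agent.
        :return: Value estimates, baseline estimates, bootstrapped next-state value estimates,
            and (when recurrent) the critic and baseline memories recorded at each step.
        """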
        n_obs = len(self.policy.behavior_spec.observation_specs)

        current_obs = ObsUtil.from_buffer(batch, n_obs)
        team_obs = GroupObsUtil.from_buffer(batch, n_obs)

        current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
        team_obs = [
            [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
            for _teammate_obs in team_obs
        ]

        team_actions = AgentAction.group_from_buffer(batch)

        next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
        next_obs = [obs.unsqueeze(0) for obs in next_obs]

        next_group_obs = [
            ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_group_obs
        ]
        # Expand dimensions of next critic obs
        next_group_obs = [
            [_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_group_obs
        ]

        if agent_id in self.value_memory_dict:
            # The agent_id should always be in both dicts since they are added together
            _init_value_mem = self.value_memory_dict[agent_id]
            _init_baseline_mem = self.baseline_memory_dict[agent_id]
        else:
            _init_value_mem = (
                torch.zeros((1, 1, self.critic.memory_size))
                if self.policy.use_recurrent
                else None
            )
            _init_baseline_mem = (
                torch.zeros((1, 1, self.critic.memory_size))
                if self.policy.use_recurrent
                else None
            )

        all_obs = [current_obs] + team_obs if team_obs is not None else [current_obs]
        all_next_value_mem: Optional[AgentBufferField] = None
        all_next_baseline_mem: Optional[AgentBufferField] = None
        if self.policy.use_recurrent:
            (
                value_estimates,
                baseline_estimates,
                all_next_value_mem,
                all_next_baseline_mem,
                next_value_mem,
                next_baseline_mem,
            ) = self._evaluate_by_sequence_team(
                current_obs, team_obs, team_actions, _init_value_mem, _init_baseline_mem
            )
        else:
            value_estimates, next_value_mem = self.critic.critic_pass(
                all_obs, _init_value_mem, sequence_length=batch.num_experiences
            )
            baseline_estimates, next_baseline_mem = self.critic.baseline(
                [current_obs],
                team_obs,
                team_actions,
                _init_baseline_mem,
                sequence_length=batch.num_experiences,
            )
        # Store the memory for the next trajectory
        self.value_memory_dict[agent_id] = next_value_mem
        self.baseline_memory_dict[agent_id] = next_baseline_mem

        all_next_obs = (
            [next_obs] + next_group_obs if next_group_obs is not None else [next_obs]
        )

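        # Bootstrap: evaluate the value of the state following the trajectory, starting
        # from the critic memory left at the end of the trajectory.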
        next_value_estimates, _ = self.critic.critic_pass(
            all_next_obs, next_value_mem, sequence_length=1
        )

        for name, estimate in baseline_estimates.items():
            baseline_estimates[name] = ModelUtils.to_numpy(estimate)

        for name, estimate in value_estimates.items():
            value_estimates[name] = ModelUtils.to_numpy(estimate)

        # The baseline and V should not share the same done flag
        for name, estimate in next_value_estimates.items():
            next_value_estimates[name] = ModelUtils.to_numpy(estimate)

        if done:
            for k in next_value_estimates:
                if not self.reward_signals[k].ignore_done:
                    next_value_estimates[k][-1] = 0.0

        return (
            value_estimates,
            baseline_estimates,
            next_value_estimates,
            all_next_value_mem,
            all_next_baseline_mem,
        )

fileFormatVersion: 2
guid: 5661ffdb6c7704e84bc785572dcd5bd1
MonoImporter:
  externalObjects: {}
  serializedVersion: 2
  defaultReferences: []
  executionOrder: 0
  icon: {instanceID: 0}
  userData:
  assetBundleName:
  assetBundleVariant: