Compare commits
Merging into: unity-tech-cn:main
Pulling from: unity-tech-cn:develop/add-fire/speedtest
This merge request has changes that conflict with the target branch. Conflicting files:
/config/ppo/3DBall.yaml
/Project/ProjectSettings/ProjectVersion.txt
/ml-agents/mlagents/trainers/optimizer/optimizer.py
/ml-agents/mlagents/trainers/policy/policy.py
/ml-agents/mlagents/trainers/ppo/trainer.py
/ml-agents/mlagents/trainers/trainer/trainer.py
/ml-agents/mlagents/trainers/trainer/rl_trainer.py
/ml-agents/mlagents/trainers/distributions_torch.py
/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
/ml-agents/mlagents/trainers/policy/torch_policy.py
/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
/ml-agents/mlagents/trainers/models.py
/ml-agents/mlagents/trainers/tests/test_ppo.py
/ml-agents/mlagents/trainers/tests/test_reward_signals.py
/ml-agents/mlagents/trainers/policy/tf_policy.py
/ml-agents/mlagents/trainers/ppo/optimizer_tf.py
2 commits
Author | SHA1 | Message | Commit date |
---|---|---|---|
Ervin Teng | 7830319f | Move test_env_perf file | 4 years ago |
Ervin Teng | f214836a | Changes for speed test | 5 years ago |
21 files changed, with 1378 insertions and 133 deletions.
Changed lines | File |
---|---|
1 | config/ppo/3DBall.yaml |
5 | Project/ProjectSettings/EditorBuildSettings.asset |
2 | Project/ProjectSettings/ProjectVersion.txt |
2 | Project/ProjectSettings/UnityConnectSettings.asset |
4 | ml-agents/mlagents/trainers/models.py |
4 | ml-agents/mlagents/trainers/trainer/rl_trainer.py |
14 | ml-agents/mlagents/trainers/trainer/trainer.py |
4 | ml-agents/mlagents/trainers/tests/test_ppo.py |
4 | ml-agents/mlagents/trainers/tests/test_reward_signals.py |
7 | ml-agents/mlagents/trainers/optimizer/optimizer.py |
111 | ml-agents/mlagents/trainers/policy/tf_policy.py |
128 | ml-agents/mlagents/trainers/policy/policy.py |
4 | ml-agents/mlagents/trainers/ppo/optimizer_tf.py |
67 | ml-agents/mlagents/trainers/ppo/trainer.py |
69 | ml-agents/mlagents/trainers/distributions_torch.py |
496 | ml-agents/mlagents/trainers/models_torch.py |
115 | ml-agents/mlagents/trainers/optimizer/torch_optimizer.py |
300 | ml-agents/mlagents/trainers/policy/torch_policy.py |
145 | ml-agents/mlagents/trainers/ppo/optimizer_torch.py |
29 | test_env_perf.py |
0 | /ml-agents/mlagents/trainers/ppo/optimizer_tf.py |
`Project/ProjectSettings/ProjectVersion.txt`: the only change is the editor version, which reads `m_EditorVersion: 2018.4.17f1` on one side of the comparison and `m_EditorVersion: 2018.4.20f1` on the other.
`ml-agents/mlagents/trainers/policy/policy.py` (excerpt; the side-by-side compare view interleaves lines from both branches):

```python
from abc import ABC, abstractmethod
from abc import abstractmethod
from typing import Dict, List, Optional
import numpy as np
from mlagents_envs.exception import UnityException


class Policy(ABC):
    @abstractmethod


class UnityPolicyException(UnityException):
    """
    Related to errors with the Trainer.
    """

    pass


class Policy(object):
    def __init__(self, brain, seed, trainer_params):
        self.brain = brain
        self.seed = seed
        self.model_path = None
        self.use_continuous_act = brain.vector_action_space_type == "continuous"
        if self.use_continuous_act:
            self.num_branches = self.brain.vector_action_space_size[0]
        else:
            self.num_branches = len(self.brain.vector_action_space_size)
        self.previous_action_dict: Dict[str, np.array] = {}
        self.memory_dict: Dict[str, np.ndarray] = {}
        self.normalize = trainer_params["normalize"]
        self.use_recurrent = trainer_params["use_recurrent"]
        self.model_path = trainer_params["output_path"]

        if self.use_recurrent:
            self.m_size = trainer_params["memory_size"]
            self.sequence_length = trainer_params["sequence_length"]
            if self.m_size == 0:
                raise UnityPolicyException(
                    "The memory size for brain {0} is 0 even "
                    "though the trainer uses recurrent.".format(brain.brain_name)
                )
            elif self.m_size % 2 != 0:
                raise UnityPolicyException(
                    "The memory size for brain {0} is {1} "
                    "but it must be divisible by 2.".format(
                        brain.brain_name, self.m_size
                    )
                )

    def make_empty_memory(self, num_agents):
        """
        Creates empty memory for use with RNNs
        :param num_agents: Number of agents.
        :return: Numpy array of zeros.
        """
        return np.zeros((num_agents, self.m_size), dtype=np.float32)

    def save_memories(
        self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
    ) -> None:
        if memory_matrix is None:
            return
        for index, agent_id in enumerate(agent_ids):
            self.memory_dict[agent_id] = memory_matrix[index, :]

    def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
        memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
        for index, agent_id in enumerate(agent_ids):
            if agent_id in self.memory_dict:
                memory_matrix[index, :] = self.memory_dict[agent_id]
        return memory_matrix

    def remove_memories(self, agent_ids):
        for agent_id in agent_ids:
            if agent_id in self.memory_dict:
                self.memory_dict.pop(agent_id)

    def make_empty_previous_action(self, num_agents):
        """
        Creates empty previous action for use with RNNs and discrete control
        :param num_agents: Number of agents.
        :return: Numpy array of zeros.
        """
        return np.zeros((num_agents, self.num_branches), dtype=np.int)

    def save_previous_action(
        self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
    ) -> None:
        if action_matrix is None:
            return
        for index, agent_id in enumerate(agent_ids):
            self.previous_action_dict[agent_id] = action_matrix[index, :]

    def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
        action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
        for index, agent_id in enumerate(agent_ids):
            if agent_id in self.previous_action_dict:
                action_matrix[index, :] = self.previous_action_dict[agent_id]
        return action_matrix

    def remove_previous_action(self, agent_ids):
        for agent_id in agent_ids:
            if agent_id in self.previous_action_dict:
                self.previous_action_dict.pop(agent_id)

        raise NotImplementedError

    @abstractmethod
    def update_normalization(self, vector_obs: np.ndarray) -> None:
        pass

    @abstractmethod
    def export_model(self, step=0):
        pass

    @abstractmethod
    def save_model(self, step=0):
        pass

    @abstractmethod
    def load_model(self, step=0):
        pass

    @abstractmethod
    def increment_step(self, n_steps):
        pass

    @abstractmethod
    def get_current_step(self):
        pass
```
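Not part of the diff: a minimal, self-contained sketch of the per-agent bookkeeping that `save_memories` / `retrieve_memories` implement above. The memory size and agent ids below are made up.

```python
import numpy as np

m_size = 4          # made-up memory size
memory_dict = {}    # maps agent id -> that agent's recurrent memory vector

def save_memories(agent_ids, memory_matrix):
    if memory_matrix is None:
        return
    for index, agent_id in enumerate(agent_ids):
        memory_dict[agent_id] = memory_matrix[index, :]

def retrieve_memories(agent_ids):
    # Agents without a stored memory fall back to zeros, as in the class above.
    memory_matrix = np.zeros((len(agent_ids), m_size), dtype=np.float32)
    for index, agent_id in enumerate(agent_ids):
        if agent_id in memory_dict:
            memory_matrix[index, :] = memory_dict[agent_id]
    return memory_matrix

save_memories(["agent-0", "agent-1"], np.arange(8, dtype=np.float32).reshape(2, 4))
print(retrieve_memories(["agent-1", "agent-2"]))
# [[4. 5. 6. 7.]
#  [0. 0. 0. 0.]]
```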
`ml-agents/mlagents/trainers/distributions_torch.py`:

```python
import torch
from torch import nn
from torch import distributions
import numpy as np

EPSILON = 1e-7  # Small value to avoid divide by zero


class GaussianDistribution(nn.Module):
    def __init__(self, hidden_size, num_outputs, conditional_sigma=False, **kwargs):
        super(GaussianDistribution, self).__init__(**kwargs)
        self.conditional_sigma = conditional_sigma
        self.mu = nn.Linear(hidden_size, num_outputs)
        nn.init.xavier_uniform_(self.mu.weight, gain=0.01)
        if conditional_sigma:
            self.log_sigma = nn.Linear(hidden_size, num_outputs)
            nn.init.xavier_uniform(self.log_sigma.weight, gain=0.01)
        else:
            self.log_sigma = nn.Parameter(
                torch.zeros(1, num_outputs, requires_grad=True)
            )

    def forward(self, inputs):
        mu = self.mu(inputs)
        if self.conditional_sigma:
            log_sigma = self.log_sigma(inputs)
        else:
            log_sigma = self.log_sigma
        return [distributions.normal.Normal(loc=mu, scale=torch.exp(log_sigma))]


class MultiCategoricalDistribution(nn.Module):
    def __init__(self, hidden_size, act_sizes):
        super(MultiCategoricalDistribution, self).__init__()
        self.act_sizes = act_sizes
        self.branches = self.create_policy_branches(hidden_size)

    def create_policy_branches(self, hidden_size):
        branches = []
        for size in self.act_sizes:
            branch_output_layer = nn.Linear(hidden_size, size)
            nn.init.xavier_uniform_(branch_output_layer.weight, gain=0.01)
            branches.append(branch_output_layer)
        return nn.ModuleList(branches)

    def mask_branch(self, logits, mask):
        raw_probs = torch.nn.functional.softmax(logits, dim=-1) * mask
        normalized_probs = raw_probs / torch.sum(raw_probs, dim=-1).unsqueeze(-1)
        normalized_logits = torch.log(normalized_probs + EPSILON)
        return normalized_logits

    def split_masks(self, masks):
        split_masks = []
        for idx, _ in enumerate(self.act_sizes):
            start = int(np.sum(self.act_sizes[:idx]))
            end = int(np.sum(self.act_sizes[: idx + 1]))
            split_masks.append(masks[:, start:end])
        return split_masks

    def forward(self, inputs, masks):
        # Todo - Support multiple branches in mask code
        branch_distributions = []
        masks = self.split_masks(masks)
        for idx, branch in enumerate(self.branches):
            logits = branch(inputs)
            norm_logits = self.mask_branch(logits, masks[idx])
            distribution = distributions.categorical.Categorical(logits=norm_logits)
            branch_distributions.append(distribution)
        return branch_distributions
```
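Not part of the diff: a self-contained sketch of the masking scheme used by `MultiCategoricalDistribution.mask_branch` above, where softmax probabilities are multiplied by a 0/1 mask, renormalized, and converted back to log-space. The logits and mask below are made up.

```python
import torch

EPSILON = 1e-7  # same small constant as above

def mask_logits(logits, mask):
    # Zero out disallowed actions in probability space, renormalize, go back to logits.
    raw_probs = torch.softmax(logits, dim=-1) * mask
    normalized_probs = raw_probs / torch.sum(raw_probs, dim=-1, keepdim=True)
    return torch.log(normalized_probs + EPSILON)

logits = torch.tensor([[1.0, 2.0, 0.5]])
mask = torch.tensor([[1.0, 0.0, 1.0]])  # action 1 is disallowed
dist = torch.distributions.Categorical(logits=mask_logits(logits, mask))
print(dist.probs)  # action 1 gets ~0 probability; actions 0 and 2 are renormalized
```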
`ml-agents/mlagents/trainers/models_torch.py`:

```python
from enum import Enum
from typing import Callable, NamedTuple

import torch
from torch import nn

from mlagents.trainers.distributions_torch import (
    GaussianDistribution,
    MultiCategoricalDistribution,
)
from mlagents.trainers.exception import UnityTrainerException

ActivationFunction = Callable[[torch.Tensor], torch.Tensor]
EncoderFunction = Callable[
    [torch.Tensor, int, ActivationFunction, int, str, bool], torch.Tensor
]

EPSILON = 1e-7


class EncoderType(Enum):
    SIMPLE = "simple"
    NATURE_CNN = "nature_cnn"
    RESNET = "resnet"


class ActionType(Enum):
    DISCRETE = "discrete"
    CONTINUOUS = "continuous"

    @staticmethod
    def from_str(label):
        if label in "continuous":
            return ActionType.CONTINUOUS
        elif label in "discrete":
            return ActionType.DISCRETE
        else:
            raise NotImplementedError


class LearningRateSchedule(Enum):
    CONSTANT = "constant"
    LINEAR = "linear"


class NormalizerTensors(NamedTuple):
    steps: torch.Tensor
    running_mean: torch.Tensor
    running_variance: torch.Tensor


class NetworkBody(nn.Module):
    def __init__(
        self,
        vector_sizes,
        visual_sizes,
        h_size,
        normalize,
        num_layers,
        m_size,
        vis_encode_type,
        use_lstm,
    ):
        super(NetworkBody, self).__init__()
        self.normalize = normalize
        self.visual_encoders = []
        self.vector_encoders = []
        self.vector_normalizers = []
        self.use_lstm = use_lstm
        self.h_size = h_size
        self.m_size = m_size

        visual_encoder = ModelUtils.get_encoder_for_type(vis_encode_type)
        for vector_size in vector_sizes:
            if vector_size != 0:
                self.vector_normalizers.append(Normalizer(vector_size))
                self.vector_encoders.append(
                    VectorEncoder(vector_size, h_size, num_layers)
                )
        for visual_size in visual_sizes:
            self.visual_encoders.append(
                visual_encoder(
                    visual_size.height,
                    visual_size.width,
                    visual_size.num_channels,
                    h_size,
                )
            )

        self.vector_encoders = nn.ModuleList(self.vector_encoders)
        self.visual_encoders = nn.ModuleList(self.visual_encoders)
        if use_lstm:
            self.lstm = nn.LSTM(h_size, m_size // 2, 1)

    def update_normalization(self, vec_inputs):
        if self.normalize:
            for idx, vec_input in enumerate(vec_inputs):
                self.vector_normalizers[idx].update(vec_input)

    def forward(self, vec_inputs, vis_inputs, memories=None, sequence_length=1):
        vec_embeds = []
        for idx, encoder in enumerate(self.vector_encoders):
            vec_input = vec_inputs[idx]
            if self.normalize:
                vec_input = self.vector_normalizers[idx](vec_input)
            hidden = encoder(vec_input)
            vec_embeds.append(hidden)

        vis_embeds = []
        for idx, encoder in enumerate(self.visual_encoders):
            vis_input = vis_inputs[idx]
            vis_input = vis_input.permute([0, 3, 1, 2])
            hidden = encoder(vis_input)
            vis_embeds.append(hidden)

        if len(vec_embeds) > 0:
            vec_embeds = torch.cat(vec_embeds)
        if len(vis_embeds) > 0:
            vis_embeds = torch.cat(vis_embeds)
        if len(vec_embeds) > 0 and len(vis_embeds) > 0:
            embedding = torch.cat([vec_embeds, vis_embeds])
        elif len(vec_embeds) > 0:
            embedding = vec_embeds
        else:
            embedding = vis_embeds

        if self.use_lstm:
            embedding = embedding.reshape([sequence_length, -1, self.h_size])
            memories = torch.split(memories, self.m_size // 2, dim=-1)
            embedding, memories = self.lstm(embedding, memories)
            embedding = embedding.reshape([-1, self.m_size // 2])
            memories = torch.cat(memories, dim=-1)
        return embedding, memories


class ActorCritic(nn.Module):
    def __init__(
        self,
        h_size,
        vector_sizes,
        visual_sizes,
        act_size,
        normalize,
        num_layers,
        m_size,
        vis_encode_type,
        act_type,
        use_lstm,
        stream_names,
        separate_critic,
    ):
        super(ActorCritic, self).__init__()
        self.act_type = ActionType.from_str(act_type)
        self.act_size = act_size
        self.separate_critic = separate_critic
        self.network_body = NetworkBody(
            vector_sizes,
            visual_sizes,
            h_size,
            normalize,
            num_layers,
            m_size,
            vis_encode_type,
            use_lstm,
        )
        if use_lstm:
            embedding_size = m_size // 2
        else:
            embedding_size = h_size
        if self.act_type == ActionType.CONTINUOUS:
            self.distribution = GaussianDistribution(embedding_size, act_size[0])
        else:
            self.distribution = MultiCategoricalDistribution(embedding_size, act_size)
        if separate_critic:
            self.critic = Critic(
                stream_names,
                h_size,
                vector_sizes,
                visual_sizes,
                normalize,
                num_layers,
                m_size,
                vis_encode_type,
            )
        else:
            self.stream_names = stream_names
            self.value_heads = ValueHeads(stream_names, embedding_size)

    def update_normalization(self, vector_obs):
        self.network_body.update_normalization(vector_obs)
        if self.separate_critic:
            self.critic.network_body.update_normalization(vector_obs)

    def critic_pass(self, vec_inputs, vis_inputs, memories=None):
        if self.separate_critic:
            return self.critic(vec_inputs, vis_inputs)
        else:
            embedding, _ = self.network_body(vec_inputs, vis_inputs, memories=memories)
            return self.value_heads(embedding)

    def sample_action(self, dists):
        actions = []
        for action_dist in dists:
            action = action_dist.sample()
            actions.append(action)
        actions = torch.stack(actions, dim=-1)
        return actions

    def get_probs_and_entropy(self, actions, dists):
        log_probs = []
        entropies = []
        for idx, action_dist in enumerate(dists):
            action = actions[..., idx]
            log_probs.append(action_dist.log_prob(action))
            entropies.append(action_dist.entropy())
        log_probs = torch.stack(log_probs, dim=-1)
        entropies = torch.stack(entropies, dim=-1)
        if self.act_type == ActionType.CONTINUOUS:
            log_probs = log_probs.squeeze(-1)
            entropies = entropies.squeeze(-1)
        return log_probs, entropies

    def get_dist_and_value(
        self, vec_inputs, vis_inputs, masks=None, memories=None, sequence_length=1
    ):
        embedding, memories = self.network_body(
            vec_inputs, vis_inputs, memories, sequence_length
        )
        if self.act_type == ActionType.CONTINUOUS:
            dists = self.distribution(embedding)
        else:
            dists = self.distribution(embedding, masks=masks)
        if self.separate_critic:
            value_outputs = self.critic(vec_inputs, vis_inputs)
        else:
            value_outputs = self.value_heads(embedding)
        return dists, value_outputs, memories

    def forward(
        self, vec_inputs, vis_inputs, masks=None, memories=None, sequence_length=1
    ):
        dists, value_outputs, memories = self.get_dist_and_value(
            vec_inputs, vis_inputs, masks, memories, sequence_length
        )
        sampled_actions = self.sample_action(dists)
        return sampled_actions, memories


class Critic(nn.Module):
    def __init__(
        self,
        stream_names,
        h_size,
        vector_sizes,
        visual_sizes,
        normalize,
        num_layers,
        m_size,
        vis_encode_type,
    ):
        super(Critic, self).__init__()
        self.network_body = NetworkBody(
            vector_sizes,
            visual_sizes,
            h_size,
            normalize,
            num_layers,
            m_size,
            vis_encode_type,
            False,
        )
        self.stream_names = stream_names
        self.value_heads = ValueHeads(stream_names, h_size)

    def forward(self, vec_inputs, vis_inputs):
        embedding, _ = self.network_body(vec_inputs, vis_inputs)
        return self.value_heads(embedding)


class Normalizer(nn.Module):
    def __init__(self, vec_obs_size, **kwargs):
        super(Normalizer, self).__init__(**kwargs)
        self.normalization_steps = torch.tensor(1)
        self.running_mean = torch.zeros(vec_obs_size)
        self.running_variance = torch.ones(vec_obs_size)

    def forward(self, inputs):
        normalized_state = torch.clamp(
            (inputs - self.running_mean)
            / torch.sqrt(self.running_variance / self.normalization_steps),
            -5,
            5,
        )
        return normalized_state

    def update(self, vector_input):
        steps_increment = vector_input.size()[0]
        total_new_steps = self.normalization_steps + steps_increment

        input_to_old_mean = vector_input - self.running_mean
        new_mean = self.running_mean + (input_to_old_mean / total_new_steps).sum(0)

        input_to_new_mean = vector_input - new_mean
        new_variance = self.running_variance + (
            input_to_new_mean * input_to_old_mean
        ).sum(0)
        self.running_mean = new_mean
        self.running_variance = new_variance
        self.normalization_steps = total_new_steps


class ValueHeads(nn.Module):
    def __init__(self, stream_names, input_size):
        super(ValueHeads, self).__init__()
        self.stream_names = stream_names
        self.value_heads = {}

        for name in stream_names:
            value = nn.Linear(input_size, 1)
            self.value_heads[name] = value
        self.value_heads = nn.ModuleDict(self.value_heads)

    def forward(self, hidden):
        value_outputs = {}
        for stream_name, _ in self.value_heads.items():
            value_outputs[stream_name] = self.value_heads[stream_name](hidden).squeeze(
                -1
            )
        return (
            value_outputs,
            torch.mean(torch.stack(list(value_outputs.values())), dim=0),
        )


class VectorEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, **kwargs):
        super(VectorEncoder, self).__init__(**kwargs)
        self.layers = [nn.Linear(input_size, hidden_size)]
        for _ in range(num_layers - 1):
            self.layers.append(nn.Linear(hidden_size, hidden_size))
            self.layers.append(nn.ReLU())
        self.layers = nn.ModuleList(self.layers)

    def forward(self, inputs):
        x = inputs
        for layer in self.layers:
            x = layer(x)
        return x


def conv_output_shape(h_w, kernel_size=1, stride=1, pad=0, dilation=1):
    from math import floor

    if type(kernel_size) is not tuple:
        kernel_size = (kernel_size, kernel_size)
    h = floor(
        ((h_w[0] + (2 * pad) - (dilation * (kernel_size[0] - 1)) - 1) / stride) + 1
    )
    w = floor(
        ((h_w[1] + (2 * pad) - (dilation * (kernel_size[1] - 1)) - 1) / stride) + 1
    )
    return h, w


class SimpleVisualEncoder(nn.Module):
    def __init__(self, height, width, initial_channels, output_size):
        super(SimpleVisualEncoder, self).__init__()
        self.h_size = output_size
        conv_1_hw = conv_output_shape((height, width), 8, 4)
        conv_2_hw = conv_output_shape(conv_1_hw, 4, 2)
        self.final_flat = conv_2_hw[0] * conv_2_hw[1] * 32

        self.conv1 = nn.Conv2d(initial_channels, 16, [8, 8], [4, 4])
        self.conv2 = nn.Conv2d(16, 32, [4, 4], [2, 2])
        self.dense = nn.Linear(self.final_flat, self.h_size)

    def forward(self, visual_obs):
        conv_1 = torch.relu(self.conv1(visual_obs))
        conv_2 = torch.relu(self.conv2(conv_1))
        hidden = self.dense(conv_2.reshape([-1, self.final_flat]))
        return hidden


class NatureVisualEncoder(nn.Module):
    def __init__(self, height, width, initial_channels, output_size):
        super(NatureVisualEncoder, self).__init__()
        self.h_size = output_size
        conv_1_hw = conv_output_shape((height, width), 8, 4)
        conv_2_hw = conv_output_shape(conv_1_hw, 4, 2)
        conv_3_hw = conv_output_shape(conv_2_hw, 3, 1)
        self.final_flat = conv_3_hw[0] * conv_3_hw[1] * 64

        self.conv1 = nn.Conv2d(initial_channels, 32, [8, 8], [4, 4])
        self.conv2 = nn.Conv2d(43, 64, [4, 4], [2, 2])
        self.conv3 = nn.Conv2d(64, 64, [3, 3], [1, 1])
        self.dense = nn.Linear(self.final_flat, self.h_size)

    def forward(self, visual_obs):
        conv_1 = torch.relu(self.conv1(visual_obs))
        conv_2 = torch.relu(self.conv2(conv_1))
        conv_3 = torch.relu(self.conv3(conv_2))
        hidden = self.dense(conv_3.reshape([-1, self.final_flat]))
        return hidden


class GlobalSteps(nn.Module):
    def __init__(self):
        super(GlobalSteps, self).__init__()
        self.global_step = torch.Tensor([0])

    def increment(self, value):
        self.global_step += value


class LearningRate(nn.Module):
    def __init__(self, lr):
        # Todo: add learning rate decay
        super(LearningRate, self).__init__()
        self.learning_rate = torch.Tensor([lr])


class ResNetVisualEncoder(nn.Module):
    def __init__(self, initial_channels):
        super(ResNetVisualEncoder, self).__init__()
        n_channels = [16, 32, 32]  # channel for each stack
        n_blocks = 2  # number of residual blocks
        self.layers = []
        for _, channel in enumerate(n_channels):
            self.layers.append(nn.Conv2d(initial_channels, channel, [3, 3], [1, 1]))
            self.layers.append(nn.MaxPool2d([3, 3], [2, 2]))
            for _ in range(n_blocks):
                self.layers.append(self.make_block(channel))
        self.layers.append(nn.ReLU())

    @staticmethod
    def make_block(channel):
        block_layers = [
            nn.ReLU(),
            nn.Conv2d(channel, channel, [3, 3], [1, 1]),
            nn.ReLU(),
            nn.Conv2d(channel, channel, [3, 3], [1, 1]),
        ]
        return block_layers

    @staticmethod
    def forward_block(input_hidden, block_layers):
        hidden = input_hidden
        for layer in block_layers:
            hidden = layer(hidden)
        return hidden + input_hidden

    def forward(self, visual_obs):
        hidden = visual_obs
        for layer in self.layers:
            if layer is nn.Module:
                hidden = layer(hidden)
            elif layer is list:
                hidden = self.forward_block(hidden, layer)
        return hidden.flatten()


class ModelUtils:
    # Minimum supported side for each encoder type. If refactoring an encoder, please
    # adjust these also.
    MIN_RESOLUTION_FOR_ENCODER = {
        EncoderType.SIMPLE: 20,
        EncoderType.NATURE_CNN: 36,
        EncoderType.RESNET: 15,
    }

    @staticmethod
    def swish(input_activation: torch.Tensor) -> torch.Tensor:
        """Swish activation function. For more info: https://arxiv.org/abs/1710.05941"""
        return torch.mul(input_activation, torch.sigmoid(input_activation))

    @staticmethod
    def get_encoder_for_type(encoder_type: EncoderType) -> nn.Module:
        ENCODER_FUNCTION_BY_TYPE = {
            EncoderType.SIMPLE: SimpleVisualEncoder,
            EncoderType.NATURE_CNN: NatureVisualEncoder,
            EncoderType.RESNET: ResNetVisualEncoder,
        }
        return ENCODER_FUNCTION_BY_TYPE.get(encoder_type)

    @staticmethod
    def _check_resolution_for_encoder(
        vis_in: torch.Tensor, vis_encoder_type: EncoderType
    ) -> None:
        min_res = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[vis_encoder_type]
        height = vis_in.shape[1]
        width = vis_in.shape[2]
        if height < min_res or width < min_res:
            raise UnityTrainerException(
                f"Visual observation resolution ({width}x{height}) is too small for"
                f"the provided EncoderType ({vis_encoder_type.value}). The min dimension is {min_res}"
            )
```
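Not part of the diff: a quick check of the convolution arithmetic used by `SimpleVisualEncoder` above. Assuming an 84x84 visual observation (a size chosen here only for illustration), the two convolutions yield 20x20 and then 9x9 feature maps, so `final_flat` comes out to 9 * 9 * 32 = 2592.

```python
from math import floor

def conv_output_shape(h_w, kernel_size=1, stride=1, pad=0, dilation=1):
    # Same formula as in models_torch.py above.
    if type(kernel_size) is not tuple:
        kernel_size = (kernel_size, kernel_size)
    h = floor(((h_w[0] + 2 * pad - dilation * (kernel_size[0] - 1) - 1) / stride) + 1)
    w = floor(((h_w[1] + 2 * pad - dilation * (kernel_size[1] - 1) - 1) / stride) + 1)
    return h, w

conv_1_hw = conv_output_shape((84, 84), 8, 4)   # (20, 20)
conv_2_hw = conv_output_shape(conv_1_hw, 4, 2)  # (9, 9)
print(conv_1_hw, conv_2_hw, conv_2_hw[0] * conv_2_hw[1] * 32)  # (20, 20) (9, 9) 2592
```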
`ml-agents/mlagents/trainers/optimizer/torch_optimizer.py`:

```python
from typing import Dict, Any, Optional, Tuple, List
import torch
import numpy as np
from mlagents_envs.base_env import DecisionSteps

from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.components.bc.module import BCModule
from mlagents.trainers.components.reward_signals.extrinsic.signal import (
    ExtrinsicRewardSignal,
)
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer import Optimizer
from mlagents.trainers.trajectory import SplitObservations


class TorchOptimizer(Optimizer):  # pylint: disable=W0223
    def __init__(self, policy: TorchPolicy, trainer_params: Dict[str, Any]):
        super(TorchOptimizer, self).__init__()
        self.policy = policy
        self.trainer_params = trainer_params
        self.update_dict: Dict[str, torch.Tensor] = {}
        self.value_heads: Dict[str, torch.Tensor] = {}
        self.memory_in: torch.Tensor = None
        self.memory_out: torch.Tensor = None
        self.m_size: int = 0
        self.global_step = torch.tensor(0)
        self.bc_module: Optional[BCModule] = None
        self.create_reward_signals(trainer_params["reward_signals"])

    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
        pass

    def create_reward_signals(self, reward_signal_configs):
        """
        Create reward signals
        :param reward_signal_configs: Reward signal config.
        """
        extrinsic_signal = ExtrinsicRewardSignal(
            self.policy, **reward_signal_configs["extrinsic"]
        )
        self.reward_signals = {"extrinsic": extrinsic_signal}
        # Create reward signals
        # for reward_signal, config in reward_signal_configs.items():
        #     self.reward_signals[reward_signal] = create_reward_signal(
        #         self.policy, reward_signal, config
        #     )
        #     self.update_dict.update(self.reward_signals[reward_signal].update_dict)

    def get_value_estimates(
        self, decision_requests: DecisionSteps, idx: int, done: bool
    ) -> Dict[str, float]:
        """
        Generates value estimates for bootstrapping.
        :param decision_requests:
        :param idx: Index in BrainInfo of agent.
        :param done: Whether or not this is the last element of the episode,
        in which case the value estimate will be 0.
        :return: The value estimate dictionary with key being the name of the reward signal
        and the value the corresponding value estimate.
        """
        vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)

        value_estimates, mean_value = self.policy.actor_critic.critic_pass(
            np.expand_dims(vec_vis_obs.vector_observations[idx], 0),
            np.expand_dims(vec_vis_obs.visual_observations[idx], 0),
        )

        value_estimates = {k: float(v) for k, v in value_estimates.items()}

        # If we're done, reassign all of the value estimates that need terminal states.
        if done:
            for k in value_estimates:
                if self.reward_signals[k].use_terminal_states:
                    value_estimates[k] = 0.0

        return value_estimates

    def get_trajectory_value_estimates(
        self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
    ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
        vector_obs = [torch.as_tensor(batch["vector_obs"])]
        if self.policy.use_vis_obs:
            visual_obs = []
            for idx, _ in enumerate(
                self.policy.actor_critic.network_body.visual_encoders
            ):
                visual_ob = torch.as_tensor(batch["visual_obs%d" % idx])
                visual_obs.append(visual_ob)
        else:
            visual_obs = []

        memory = torch.zeros([1, len(vector_obs[0]), self.policy.m_size])

        next_obs = np.concatenate(next_obs, axis=-1)
        next_obs = [torch.as_tensor(next_obs).unsqueeze(0)]
        next_memory = torch.zeros([1, 1, self.policy.m_size])

        value_estimates, mean_value = self.policy.actor_critic.critic_pass(
            vector_obs, visual_obs, memory
        )

        next_value_estimate, next_value = self.policy.actor_critic.critic_pass(
            next_obs, next_obs, next_memory
        )

        for name, estimate in value_estimates.items():
            value_estimates[name] = estimate.detach().numpy()
            next_value_estimate[name] = next_value_estimate[name].detach().numpy()

        if done:
            for k in next_value_estimate:
                if self.reward_signals[k].use_terminal_states:
                    next_value_estimate[k] = 0.0

        return value_estimates, next_value_estimate
```
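Not part of the diff: a minimal sketch (values and signal names invented) of the bootstrapping rule `get_trajectory_value_estimates` applies above. When the trajectory ends the episode, the bootstrap value is forced to zero for any reward signal that treats episode ends as terminal.

```python
# Hypothetical bootstrap estimates for the observation after the last step.
next_value_estimate = {"extrinsic": 1.7, "curiosity": 0.4}
# Invented flags standing in for reward_signal.use_terminal_states.
use_terminal_states = {"extrinsic": True, "curiosity": False}

done = True  # the last step of the episode
if done:
    for name in next_value_estimate:
        if use_terminal_states[name]:
            next_value_estimate[name] = 0.0

print(next_value_estimate)  # {'extrinsic': 0.0, 'curiosity': 0.4}
```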
from typing import Any, Dict, List
import numpy as np
import torch
import os
from torch import onnx
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.brain_conversion_utils import get_global_agent_id

from mlagents.trainers.policy import Policy
from mlagents_envs.base_env import DecisionSteps
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed

from mlagents.trainers.policy.policy import UnityPolicyException
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models_torch import EncoderType, ActorCritic

EPSILON = 1e-7  # Small value to avoid divide by zero

torch.set_num_threads(1)


class TorchPolicy(Policy):
    def __init__(
        self,
        seed: int,
        brain: BrainParameters,
        trainer_params: Dict[str, Any],
        load: bool,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
    ):
        """
        Policy that uses a multilayer perceptron to map the observations to actions. Could
        also use a CNN to encode visual input prior to the MLP. Supports discrete and
        continuous action spaces, as well as recurrent networks.
        :param seed: Random seed.
        :param brain: Assigned BrainParameters object.
        :param trainer_params: Defined training parameters.
        :param load: Whether a pre-trained model will be loaded or a new one created.
        :param tanh_squash: Whether to use a tanh function on the continuous output,
        or a clipped output.
        :param reparameterize: Whether we are using the resampling trick to update the policy
        in continuous output.
        :param condition_sigma_on_obs: Whether the standard deviation of the action
        distribution is conditioned on the observations.
        """
        super(TorchPolicy, self).__init__(brain, seed, trainer_params)
        self.grads = None
        num_layers = trainer_params["num_layers"]
        self.h_size = trainer_params["hidden_units"]
        self.seed = seed
        self.brain = brain
        self.global_step = 0
        self.m_size = 0

        self.act_size = brain.vector_action_space_size
        self.act_type = brain.vector_action_space_type
        self.sequence_length = 1
        if self.use_recurrent:
            self.m_size = trainer_params["memory_size"]
            self.sequence_length = trainer_params["sequence_length"]
            if self.m_size == 0:
                raise UnityPolicyException(
                    "The memory size for brain {0} is 0 even "
                    "though the trainer uses recurrent.".format(brain.brain_name)
                )
            elif self.m_size % 2 != 0:
                raise UnityPolicyException(
                    "The memory size for brain {0} is {1} "
                    "but it must be divisible by 2.".format(
                        brain.brain_name, self.m_size
                    )
                )

        if num_layers < 1:
            num_layers = 1
        self.num_layers = num_layers
        self.vis_encode_type = EncoderType(
            trainer_params.get("vis_encode_type", "simple")
        )
        self.tanh_squash = tanh_squash
        self.reparameterize = reparameterize
        self.condition_sigma_on_obs = condition_sigma_on_obs

        # Non-exposed parameters; these aren't exposed because they don't have a
        # good explanation and usually shouldn't be touched.
        self.log_std_min = -20
        self.log_std_max = 2

        self.inference_dict: Dict[str, tf.Tensor] = {}
        self.update_dict: Dict[str, tf.Tensor] = {}
        # TF defaults to 32-bit, so we use the same here.
        torch.set_default_tensor_type(torch.FloatTensor)

        reward_signal_configs = trainer_params["reward_signals"]
        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
        }

        self.actor_critic = ActorCritic(
            h_size=int(trainer_params["hidden_units"]),
            act_type=self.act_type,
            vector_sizes=[brain.vector_observation_space_size],
            act_size=brain.vector_action_space_size,
            normalize=trainer_params["normalize"],
            num_layers=int(trainer_params["num_layers"]),
            m_size=trainer_params["memory_size"],
            use_lstm=self.use_recurrent,
            visual_sizes=brain.camera_resolutions,
            vis_encode_type=EncoderType(
                trainer_params.get("vis_encode_type", "simple")
            ),
            stream_names=list(reward_signal_configs.keys()),
            separate_critic=self.use_continuous_act,
        )

    def split_decision_step(self, decision_requests):
        vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
        mask = None
        if not self.use_continuous_act:
            mask = torch.ones(
                [len(decision_requests), np.sum(self.brain.vector_action_space_size)]
            )
            if decision_requests.action_mask is not None:
                mask = torch.as_tensor(
                    1 - np.concatenate(decision_requests.action_mask, axis=1)
                )
        return vec_vis_obs.vector_observations, vec_vis_obs.visual_observations, mask

    def update_normalization(self, vector_obs: np.ndarray) -> None:
        """
        If this policy normalizes vector observations, this will update the norm values in the graph.
        :param vector_obs: The vector observations to add to the running estimate of the distribution.
        """
        vector_obs = [torch.as_tensor(vector_obs)]
        if self.use_vec_obs and self.normalize:
            self.actor_critic.update_normalization(vector_obs)

    @timed
    def sample_actions(self, vec_obs, vis_obs, masks=None, memories=None, seq_len=1):
        dists, (
            value_heads,
            mean_value,
        ), memories = self.actor_critic.get_dist_and_value(
            vec_obs, vis_obs, masks, memories, seq_len
        )

        actions = self.actor_critic.sample_action(dists)
        log_probs, entropies = self.actor_critic.get_probs_and_entropy(actions, dists)
        if self.act_type == "continuous":
            actions.squeeze_(-1)

        return actions, log_probs, entropies, value_heads, memories

    def evaluate_actions(
        self, vec_obs, vis_obs, actions, masks=None, memories=None, seq_len=1
    ):
        dists, (value_heads, mean_value), _ = self.actor_critic.get_dist_and_value(
            vec_obs, vis_obs, masks, memories, seq_len
        )

        log_probs, entropies = self.actor_critic.get_probs_and_entropy(actions, dists)

        return log_probs, entropies, value_heads

    @timed
    def evaluate(
        self, decision_requests: DecisionSteps, global_agent_ids: List[str]
    ) -> Dict[str, Any]:
        """
        Evaluates policy for the agent experiences provided.
        :param global_agent_ids: Global (worker-prefixed) ids of the requesting agents.
        :param decision_requests: DecisionSteps object containing inputs.
        :return: Dictionary of network outputs (action, log probs, entropy, value heads
        and, if recurrent, memories).
        """
        vec_obs, vis_obs, masks = self.split_decision_step(decision_requests)
        vec_obs = [torch.as_tensor(vec_obs)]
        vis_obs = [torch.as_tensor(vis_ob) for vis_ob in vis_obs]
        memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(
            0
        )

        run_out = {}
        with torch.no_grad():
            action, log_probs, entropy, value_heads, memories = self.sample_actions(
                vec_obs, vis_obs, masks=masks, memories=memories
            )
        run_out["action"] = action.detach().numpy()
        run_out["pre_action"] = action.detach().numpy()
        # Todo - make pre_action difference
        run_out["log_probs"] = log_probs.detach().numpy()
        run_out["entropy"] = entropy.detach().numpy()
        run_out["value_heads"] = {
            name: t.detach().numpy() for name, t in value_heads.items()
        }
        run_out["value"] = np.mean(list(run_out["value_heads"].values()), 0)
        run_out["learning_rate"] = 0.0
        if self.use_recurrent:
            # get_action() looks up "memory_out" when saving recurrent memories,
            # so store the new memories under that key.
            run_out["memory_out"] = memories.detach().numpy()
        self.actor_critic.update_normalization(vec_obs)
        return run_out

    def get_action(
        self, decision_requests: DecisionSteps, worker_id: int = 0
    ) -> ActionInfo:
        """
        Decides actions given observations information, and takes them in environment.
        :param worker_id: The id of the worker the DecisionSteps came from.
        :param decision_requests: A DecisionSteps object from the environment.
        :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
        """
        if len(decision_requests) == 0:
            return ActionInfo.empty()

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in decision_requests.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(
            decision_requests, global_agent_ids
        )  # pylint: disable=assignment-from-no-return
        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        return ActionInfo(
            action=run_out.get("action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=list(decision_requests.agent_id),
        )

    def save_model(self, step=0):
        """
        Saves the model
        :param step: The number of steps the model was trained for
        """
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        save_path = self.model_path + "/model-" + str(step) + ".pt"
        torch.save(self.actor_critic.state_dict(), save_path)

    def load_model(self, step=0):
        load_path = self.model_path + "/model-" + str(step) + ".pt"
        self.actor_critic.load_state_dict(torch.load(load_path))

    def export_model(self, step=0):
        fake_vec_obs = [torch.zeros([1] + [self.vec_obs_size])]
        fake_vis_obs = [
            torch.zeros(
                [1] + [camera_res.height, camera_res.width, camera_res.num_channels]
            )
            for camera_res in self.brain.camera_resolutions
        ]
        if self.use_continuous_act:
            fake_masks = None
        else:
            fake_masks = torch.ones([1] + [int(np.sum(self.act_size))])
        fake_memories = torch.zeros([1] + [self.m_size])
        export_path = self.model_path + "/model-" + str(step) + ".onnx"
        output_names = ["action", "value_estimates", "memories"]
        onnx.export(
            self.actor_critic,
            (fake_vec_obs, fake_vis_obs, fake_masks, fake_memories, 1),
            export_path,
            verbose=True,
            output_names=output_names,
        )

    @property
    def vis_obs_size(self):
        return self.brain.number_visual_observations

    @property
    def vec_obs_size(self):
        return self.brain.vector_observation_space_size

    @property
    def use_vis_obs(self):
        return self.vis_obs_size > 0

    @property
    def use_vec_obs(self):
        return self.vec_obs_size > 0

    def get_current_step(self):
        """
        Gets current model step.
        :return: current model step.
        """
        step = self.global_step
        return step

    def increment_step(self, n_steps):
        """
        Increments model step.
        """
        self.global_step += n_steps
        return self.get_current_step()
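export_model above traces the actor-critic through torch.onnx.export using zero-filled dummy inputs shaped like the policy's observations. For reference, the snippet below is a minimal sketch of that same call pattern on a tiny stand-alone module; TinyNet, its shapes and the output file name are hypothetical and not part of ML-Agents.

import torch
from torch import nn, onnx

class TinyNet(nn.Module):
    """Stand-in for the actor-critic: maps a 3-float observation to 2 outputs."""
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(3, 2)

    def forward(self, obs):
        return self.fc(obs)

model = TinyNet()
dummy_obs = torch.zeros([1, 3])  # batch of one fake observation, used only for tracing
onnx.export(
    model,
    (dummy_obs,),                # example inputs that drive the trace
    "tiny_net.onnx",
    input_names=["vector_observation"],
    output_names=["action"],
)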
from typing import Any, Dict
import torch

from mlagents.trainers.buffer import AgentBuffer

from mlagents_envs.timers import timed
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer


class TorchPPOOptimizer(TorchOptimizer):
    def __init__(self, policy: TorchPolicy, trainer_params: Dict[str, Any]):
        """
        Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
        The PPO optimizer has a value estimator and a loss function.
        :param policy: A TorchPolicy object that will be updated by this PPO Optimizer.
        :param trainer_params: Trainer parameters dictionary that specifies the
        properties of the trainer.
        """
        super(TorchPPOOptimizer, self).__init__(policy, trainer_params)
        params = list(self.policy.actor_critic.parameters())

        self.optimizer = torch.optim.Adam(
            params, lr=self.trainer_params["learning_rate"]
        )
        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
        }

        self.stream_names = list(self.reward_signals.keys())

    def ppo_value_loss(self, values, old_values, returns):
        """
        Computes the clipped value loss for PPO, averaged over all value streams.
        :param values: Current value estimates, keyed by reward signal name.
        :param old_values: Value estimates recorded when the experiences were collected.
        :param returns: Discounted returns, keyed by reward signal name.
        """
        decay_epsilon = self.trainer_params["epsilon"]

        value_losses = []
        for name, head in values.items():
            old_val_tensor = old_values[name]
            returns_tensor = returns[name]
            clipped_value_estimate = old_val_tensor + torch.clamp(
                head - old_val_tensor, -decay_epsilon, decay_epsilon
            )
            v_opt_a = (returns_tensor - head) ** 2
            v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
            value_loss = torch.mean(torch.max(v_opt_a, v_opt_b))
            value_losses.append(value_loss)
        value_loss = torch.mean(torch.stack(value_losses))
        return value_loss

    def ppo_policy_loss(self, advantages, log_probs, old_log_probs, masks):
        """
        Computes the clipped surrogate policy loss for PPO.
        :param advantages: Advantage estimates.
        :param log_probs: Current policy log probabilities.
        :param old_log_probs: Log probabilities recorded when the experiences were collected.
        :param masks: Loss masks (not used in this version).
        """
        advantage = advantages.unsqueeze(-1)

        decay_epsilon = self.trainer_params["epsilon"]

        r_theta = torch.exp(log_probs - old_log_probs)
        p_opt_a = r_theta * advantage
        p_opt_b = (
            torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage
        )
        policy_loss = -torch.mean(torch.min(p_opt_a, p_opt_b))
        return policy_loss

    @timed
    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
        """
        Performs update on model.
        :param batch: Batch of experiences.
        :param num_sequences: Number of sequences to process.
        :return: Results of update.
        """
        returns = {}
        old_values = {}
        for name in self.reward_signals:
            old_values[name] = torch.as_tensor(batch["{}_value_estimates".format(name)])
            returns[name] = torch.as_tensor(batch["{}_returns".format(name)])

        vec_obs = [torch.as_tensor(batch["vector_obs"])]
        act_masks = torch.as_tensor(batch["action_mask"])
        if self.policy.use_continuous_act:
            actions = torch.as_tensor(batch["actions"]).unsqueeze(-1)
        else:
            actions = torch.as_tensor(batch["actions"])

        memories = [
            torch.as_tensor(batch["memory"][i])
            for i in range(0, len(batch["memory"]), self.policy.sequence_length)
        ]
        if len(memories) > 0:
            memories = torch.stack(memories).unsqueeze(0)

        if self.policy.use_vis_obs:
            vis_obs = []
            for idx, _ in enumerate(
                self.policy.actor_critic.network_body.visual_encoders
            ):
                vis_ob = torch.as_tensor(batch["visual_obs%d" % idx])
                vis_obs.append(vis_ob)
        else:
            vis_obs = []
        log_probs, entropy, values = self.policy.evaluate_actions(
            vec_obs,
            vis_obs,
            masks=act_masks,
            actions=actions,
            memories=memories,
            seq_len=self.policy.sequence_length,
        )
        value_loss = self.ppo_value_loss(values, old_values, returns)
        policy_loss = self.ppo_policy_loss(
            torch.as_tensor(batch["advantages"]),
            log_probs,
            torch.as_tensor(batch["action_probs"]),
            torch.as_tensor(batch["masks"], dtype=torch.int32),
        )
        loss = (
            policy_loss
            + 0.5 * value_loss
            - self.trainer_params["beta"] * torch.mean(entropy)
        )
        self.optimizer.zero_grad()
        loss.backward()

        self.optimizer.step()
        update_stats = {
            "Losses/Policy Loss": abs(policy_loss.detach().numpy()),
            "Losses/Value Loss": value_loss.detach().numpy(),
        }

        return update_stats
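As a quick sanity check of the clipped objectives implemented above, the snippet below evaluates the same formulas on hand-picked tensors. The epsilon value and the numbers are illustrative only, not taken from any trainer configuration.

import torch

epsilon = 0.2                                      # illustrative clip range
advantage = torch.tensor([1.0, -1.0]).unsqueeze(-1)
log_probs = torch.tensor([[-0.1], [-0.1]])         # current policy
old_log_probs = torch.tensor([[-0.5], [-0.05]])    # policy that collected the data

r_theta = torch.exp(log_probs - old_log_probs)     # probability ratios (~1.49 and ~0.95)
p_opt_a = r_theta * advantage
p_opt_b = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
policy_loss = -torch.mean(torch.min(p_opt_a, p_opt_b))  # the 1.49 ratio is clipped to 1.2

# Clipped value loss for a single value stream.
values = torch.tensor([1.5, 0.2])
old_values = torch.tensor([1.0, 0.5])
returns = torch.tensor([2.0, 0.0])
clipped = old_values + torch.clamp(values - old_values, -epsilon, epsilon)
value_loss = torch.mean(torch.max((returns - values) ** 2, (returns - clipped) ** 2))

print(policy_loss.item(), value_loss.item())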
from mlagents.trainers.tests.test_simple_rl import (
    _check_environment_trains,
    PPO_CONFIG,
    generate_config,
)
from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment
from mlagents.trainers.ppo.trainer import TIMINGS
import matplotlib.pyplot as plt
import numpy as np

BRAIN_NAME = "1D"

if __name__ == "__main__":
    env = SimpleEnvironment([BRAIN_NAME], use_discrete=False)
    config = generate_config(
        PPO_CONFIG,
        override_vals={"batch_size": 256, "max_steps": 20000, "buffer_size": 1024},
    )
    try:
        _check_environment_trains(env, config)
    except Exception:
        # Ignore training failures; this script only benchmarks update times.
        pass
    print(f"Mean update time {np.mean(TIMINGS)}")
    plt.plot(TIMINGS)
    plt.ylim((0, 0.006))
    plt.title("PyTorch w/ 3DBall Running, batch size 256, 32 hidden units, 1 layer")
    plt.ylabel("Update Time (s)")
    plt.xlabel("Update #")
    plt.show()
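TIMINGS is assumed here to be a module-level list that the PPO trainer's update path appends a duration to on every call. The decorator below is a hypothetical illustration of that accumulation pattern, runnable on its own; it is not the mlagents_envs.timers implementation, and record_timing, UPDATE_TIMINGS and fake_update are made-up names.

import time
from functools import wraps

UPDATE_TIMINGS = []  # hypothetical stand-in for TIMINGS

def record_timing(func):
    """Append each call's wall-clock duration to UPDATE_TIMINGS."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        UPDATE_TIMINGS.append(time.perf_counter() - start)
        return result
    return wrapper

@record_timing
def fake_update():
    time.sleep(0.001)  # stand-in for an optimizer update

for _ in range(10):
    fake_update()
print(f"Mean update time {sum(UPDATE_TIMINGS) / len(UPDATE_TIMINGS):.4f}s")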