比较提交
合并到: unity-tech-cn:main
unity-tech-cn:/main
unity-tech-cn:/develop-generalizationTraining-TrainerController
unity-tech-cn:/tag-0.2.0
unity-tech-cn:/tag-0.2.1
unity-tech-cn:/tag-0.2.1a
unity-tech-cn:/tag-0.2.1c
unity-tech-cn:/tag-0.2.1d
unity-tech-cn:/hotfix-v0.9.2a
unity-tech-cn:/develop-gpu-test
unity-tech-cn:/0.10.1
unity-tech-cn:/develop-pyinstaller
unity-tech-cn:/develop-horovod
unity-tech-cn:/PhysXArticulations20201
unity-tech-cn:/importdocfix
unity-tech-cn:/develop-resizetexture
unity-tech-cn:/hh-develop-walljump_bugfixes
unity-tech-cn:/develop-walljump-fix-sac
unity-tech-cn:/hh-develop-walljump_rnd
unity-tech-cn:/tag-0.11.0.dev0
unity-tech-cn:/develop-pytorch
unity-tech-cn:/tag-0.11.0.dev2
unity-tech-cn:/develop-newnormalization
unity-tech-cn:/tag-0.11.0.dev3
unity-tech-cn:/develop
unity-tech-cn:/release-0.12.0
unity-tech-cn:/tag-0.12.0-dev
unity-tech-cn:/tag-0.12.0.dev0
unity-tech-cn:/tag-0.12.1
unity-tech-cn:/2D-explorations
unity-tech-cn:/asymm-envs
unity-tech-cn:/tag-0.12.1.dev0
unity-tech-cn:/2D-exploration-raycast
unity-tech-cn:/tag-0.12.1.dev1
unity-tech-cn:/release-0.13.0
unity-tech-cn:/release-0.13.1
unity-tech-cn:/plugin-proof-of-concept
unity-tech-cn:/release-0.14.0
unity-tech-cn:/hotfix-bump-version-master
unity-tech-cn:/soccer-fives
unity-tech-cn:/release-0.14.1
unity-tech-cn:/bug-failed-api-check
unity-tech-cn:/test-recurrent-gail
unity-tech-cn:/hh-add-icons
unity-tech-cn:/release-0.15.0
unity-tech-cn:/release-0.15.1
unity-tech-cn:/hh-develop-all-posed-characters
unity-tech-cn:/internal-policy-ghost
unity-tech-cn:/distributed-training
unity-tech-cn:/hh-develop-improve_tennis
unity-tech-cn:/test-tf-ver
unity-tech-cn:/release_1_branch
unity-tech-cn:/tennis-time-horizon
unity-tech-cn:/whitepaper-experiments
unity-tech-cn:/r2v-yamato-linux
unity-tech-cn:/docs-update
unity-tech-cn:/release_2_branch
unity-tech-cn:/exp-mede
unity-tech-cn:/sensitivity
unity-tech-cn:/release_2_verified_load_fix
unity-tech-cn:/test-sampler
unity-tech-cn:/release_2_verified
unity-tech-cn:/hh-develop-ragdoll-testing
unity-tech-cn:/origin-develop-taggedobservations
unity-tech-cn:/MLA-1734-demo-provider
unity-tech-cn:/sampler-refactor-copy
unity-tech-cn:/PhysXArticulations20201Package
unity-tech-cn:/tag-com.unity.ml-agents_1.0.8
unity-tech-cn:/release_3_branch
unity-tech-cn:/github-actions
unity-tech-cn:/release_3_distributed
unity-tech-cn:/fix-batch-tennis
unity-tech-cn:/distributed-ppo-sac
unity-tech-cn:/gridworld-custom-obs
unity-tech-cn:/hw20-segmentation
unity-tech-cn:/hh-develop-gamedev-demo
unity-tech-cn:/active-variablespeed
unity-tech-cn:/release_4_branch
unity-tech-cn:/fix-env-step-loop
unity-tech-cn:/release_5_branch
unity-tech-cn:/fix-walker
unity-tech-cn:/release_6_branch
unity-tech-cn:/hh-32-observation-crawler
unity-tech-cn:/trainer-plugin
unity-tech-cn:/hh-develop-max-steps-demo-recorder
unity-tech-cn:/hh-develop-loco-walker-variable-speed
unity-tech-cn:/exp-0002
unity-tech-cn:/experiment-less-max-step
unity-tech-cn:/hh-develop-hallway-wall-mesh-fix
unity-tech-cn:/release_7_branch
unity-tech-cn:/exp-vince
unity-tech-cn:/hh-develop-gridsensor-tests
unity-tech-cn:/tag-release_8_test0
unity-tech-cn:/tag-release_8_test1
unity-tech-cn:/release_8_branch
unity-tech-cn:/docfix-end-episode
unity-tech-cn:/release_9_branch
unity-tech-cn:/hybrid-action-rewardsignals
unity-tech-cn:/MLA-462-yamato-win
unity-tech-cn:/exp-alternate-atten
unity-tech-cn:/hh-develop-fps_game_project
unity-tech-cn:/fix-conflict-base-env
unity-tech-cn:/release_10_branch
unity-tech-cn:/exp-bullet-hell-trainer
unity-tech-cn:/ai-summit-exp
unity-tech-cn:/comms-grad
unity-tech-cn:/walljump-pushblock
unity-tech-cn:/goal-conditioning
unity-tech-cn:/release_11_branch
unity-tech-cn:/hh-develop-water-balloon-fight
unity-tech-cn:/gc-hyper
unity-tech-cn:/layernorm
unity-tech-cn:/yamato-linux-debug-venv
unity-tech-cn:/soccer-comms
unity-tech-cn:/hh-develop-pushblockcollab
unity-tech-cn:/release_12_branch
unity-tech-cn:/fix-get-step-sp-curr
unity-tech-cn:/continuous-comms
unity-tech-cn:/no-comms
unity-tech-cn:/hh-develop-zombiepushblock
unity-tech-cn:/hypernetwork
unity-tech-cn:/revert-4859-develop-update-readme
unity-tech-cn:/sequencer-env-attention
unity-tech-cn:/hh-develop-variableobs
unity-tech-cn:/exp-tanh
unity-tech-cn:/reward-dist
unity-tech-cn:/exp-weight-decay
unity-tech-cn:/exp-robot
unity-tech-cn:/bullet-hell-barracuda-test-1.3.1
unity-tech-cn:/release_13_branch
unity-tech-cn:/release_14_branch
unity-tech-cn:/exp-clipped-gaussian-entropy
unity-tech-cn:/tic-tac-toe
unity-tech-cn:/hh-develop-dodgeball
unity-tech-cn:/repro-vis-obs-perf
unity-tech-cn:/v2-staging-rebase
unity-tech-cn:/release_15_branch
unity-tech-cn:/release_15_removeendepisode
unity-tech-cn:/release_16_branch
unity-tech-cn:/release_16_fix_gridsensor
unity-tech-cn:/ai-hw-2021
unity-tech-cn:/check-for-ModelOverriders
unity-tech-cn:/fix-grid-obs-shape-init
unity-tech-cn:/fix-gym-needs-reset
unity-tech-cn:/fix-resume-imi
unity-tech-cn:/release_17_branch
unity-tech-cn:/release_17_branch_gpu_test
unity-tech-cn:/colab-links
unity-tech-cn:/exp-continuous-div
unity-tech-cn:/release_17_branch_gpu_2
unity-tech-cn:/exp-diverse-behavior
unity-tech-cn:/grid-onehot-extra-dim-empty
unity-tech-cn:/2.0-verified
unity-tech-cn:/faster-entropy-coeficient-convergence
unity-tech-cn:/pre-r18-update-changelog
unity-tech-cn:/release_18_branch
unity-tech-cn:/main/tracking
unity-tech-cn:/main/reward-providers
unity-tech-cn:/main/project-upgrade
unity-tech-cn:/main/limitation-docs
unity-tech-cn:/develop/nomaxstep-test
unity-tech-cn:/develop/tf2.0
unity-tech-cn:/develop/tanhsquash
unity-tech-cn:/develop/magic-string
unity-tech-cn:/develop/trainerinterface
unity-tech-cn:/develop/separatevalue
unity-tech-cn:/develop/nopreviousactions
unity-tech-cn:/develop/reenablerepeatactions
unity-tech-cn:/develop/0memories
unity-tech-cn:/develop/fixmemoryleak
unity-tech-cn:/develop/reducewalljump
unity-tech-cn:/develop/removeactionholder-onehot
unity-tech-cn:/develop/canonicalize-quaternions
unity-tech-cn:/develop/self-playassym
unity-tech-cn:/develop/demo-load-seek
unity-tech-cn:/develop/progress-bar
unity-tech-cn:/develop/sac-apex
unity-tech-cn:/develop/cubewars
unity-tech-cn:/develop/add-fire
unity-tech-cn:/develop/gym-wrapper
unity-tech-cn:/develop/mm-docs-main-readme
unity-tech-cn:/develop/mm-docs-overview
unity-tech-cn:/develop/no-threading
unity-tech-cn:/develop/dockerfile
unity-tech-cn:/develop/model-store
unity-tech-cn:/develop/checkout-conversion-rebase
unity-tech-cn:/develop/model-transfer
unity-tech-cn:/develop/bisim-review
unity-tech-cn:/develop/taggedobservations
unity-tech-cn:/develop/transfer-bisim
unity-tech-cn:/develop/bisim-sac-transfer
unity-tech-cn:/develop/basketball
unity-tech-cn:/develop/torchmodules
unity-tech-cn:/develop/fixmarkdown
unity-tech-cn:/develop/shortenstrikervsgoalie
unity-tech-cn:/develop/shortengoalie
unity-tech-cn:/develop/torch-save-rp
unity-tech-cn:/develop/torch-to-np
unity-tech-cn:/develop/torch-omp-no-thread
unity-tech-cn:/develop/actionmodel-csharp
unity-tech-cn:/develop/torch-extra
unity-tech-cn:/develop/restructure-torch-networks
unity-tech-cn:/develop/jit
unity-tech-cn:/develop/adjust-cpu-settings-experiment
unity-tech-cn:/develop/torch-sac-threading
unity-tech-cn:/develop/wb
unity-tech-cn:/develop/amrl
unity-tech-cn:/develop/memorydump
unity-tech-cn:/develop/permutepytorch
unity-tech-cn:/develop/sac-targetq
unity-tech-cn:/develop/actions-out
unity-tech-cn:/develop/reshapeonnxmemories
unity-tech-cn:/develop/crawlergail
unity-tech-cn:/develop/debugtorchfood
unity-tech-cn:/develop/hybrid-actions
unity-tech-cn:/develop/bullet-hell
unity-tech-cn:/develop/action-spec-gym
unity-tech-cn:/develop/battlefoodcollector
unity-tech-cn:/develop/use-action-buffers
unity-tech-cn:/develop/hardswish
unity-tech-cn:/develop/leakyrelu
unity-tech-cn:/develop/torch-clip-scale
unity-tech-cn:/develop/contentropy
unity-tech-cn:/develop/manch
unity-tech-cn:/develop/torchcrawlerdebug
unity-tech-cn:/develop/fix-nan
unity-tech-cn:/develop/multitype-buffer
unity-tech-cn:/develop/windows-delay
unity-tech-cn:/develop/torch-tanh
unity-tech-cn:/develop/gail-norm
unity-tech-cn:/develop/multiprocess
unity-tech-cn:/develop/unified-obs
unity-tech-cn:/develop/rm-rf-new-models
unity-tech-cn:/develop/skipcritic
unity-tech-cn:/develop/centralizedcritic
unity-tech-cn:/develop/dodgeball-tests
unity-tech-cn:/develop/cc-teammanager
unity-tech-cn:/develop/weight-decay
unity-tech-cn:/develop/singular-embeddings
unity-tech-cn:/develop/zombieteammanager
unity-tech-cn:/develop/superpush
unity-tech-cn:/develop/teammanager
unity-tech-cn:/develop/zombie-exp
unity-tech-cn:/develop/update-readme
unity-tech-cn:/develop/readme-fix
unity-tech-cn:/develop/coma-noact
unity-tech-cn:/develop/coma-withq
unity-tech-cn:/develop/coma2
unity-tech-cn:/develop/action-slice
unity-tech-cn:/develop/gru
unity-tech-cn:/develop/critic-op-lstm-currentmem
unity-tech-cn:/develop/decaygail
unity-tech-cn:/develop/gail-srl-hack
unity-tech-cn:/develop/rear-pad
unity-tech-cn:/develop/mm-copyright-dates
unity-tech-cn:/develop/dodgeball-raycasts
unity-tech-cn:/develop/collab-envs-exp-ervin
unity-tech-cn:/develop/pushcollabonly
unity-tech-cn:/develop/sample-curation
unity-tech-cn:/develop/soccer-groupman
unity-tech-cn:/develop/input-actuator-tanks
unity-tech-cn:/develop/validate-release-fix
unity-tech-cn:/develop/new-console-log
unity-tech-cn:/develop/lex-walker-model
unity-tech-cn:/develop/lstm-burnin
unity-tech-cn:/develop/grid-vaiable-names
unity-tech-cn:/develop/fix-attn-embedding
unity-tech-cn:/develop/api-documentation-update-some-fixes
unity-tech-cn:/develop/update-grpc
unity-tech-cn:/develop/grid-rootref-debug
unity-tech-cn:/develop/pbcollab-rays
unity-tech-cn:/develop/2.0-verified-pre
unity-tech-cn:/develop/parameterizedenvs
unity-tech-cn:/develop/custom-ray-sensor
unity-tech-cn:/develop/mm-add-v2blog
unity-tech-cn:/develop/custom-raycast
unity-tech-cn:/develop/area-manager
unity-tech-cn:/develop/remove-unecessary-lr
unity-tech-cn:/develop/use-base-env-in-learn
unity-tech-cn:/soccer-fives/multiagent
unity-tech-cn:/develop/cubewars/splashdamage
unity-tech-cn:/develop/add-fire/exp
unity-tech-cn:/develop/add-fire/jit
unity-tech-cn:/develop/add-fire/speedtest
unity-tech-cn:/develop/add-fire/bc
unity-tech-cn:/develop/add-fire/ckpt-2
unity-tech-cn:/develop/add-fire/normalize-context
unity-tech-cn:/develop/add-fire/components-dir
unity-tech-cn:/develop/add-fire/halfentropy
unity-tech-cn:/develop/add-fire/memoryclass
unity-tech-cn:/develop/add-fire/categoricaldist
unity-tech-cn:/develop/add-fire/mm
unity-tech-cn:/develop/add-fire/sac-lst
unity-tech-cn:/develop/add-fire/mm3
unity-tech-cn:/develop/add-fire/continuous
unity-tech-cn:/develop/add-fire/ghost
unity-tech-cn:/develop/add-fire/policy-tests
unity-tech-cn:/develop/add-fire/export-discrete
unity-tech-cn:/develop/add-fire/test-simple-rl-fix-resnet
unity-tech-cn:/develop/add-fire/remove-currdoc
unity-tech-cn:/develop/add-fire/clean2
unity-tech-cn:/develop/add-fire/doc-cleanups
unity-tech-cn:/develop/add-fire/changelog
unity-tech-cn:/develop/add-fire/mm2
unity-tech-cn:/develop/model-transfer/add-physics
unity-tech-cn:/develop/model-transfer/train
unity-tech-cn:/develop/jit/experiments
unity-tech-cn:/exp-vince/sep30-2020
unity-tech-cn:/hh-develop-gridsensor-tests/static
unity-tech-cn:/develop/hybrid-actions/distlist
unity-tech-cn:/develop/bullet-hell/buffer
unity-tech-cn:/goal-conditioning/new
unity-tech-cn:/goal-conditioning/sensors-2
unity-tech-cn:/goal-conditioning/sensors-3-pytest-fix
unity-tech-cn:/goal-conditioning/grid-world
unity-tech-cn:/soccer-comms/disc
unity-tech-cn:/develop/centralizedcritic/counterfact
unity-tech-cn:/develop/centralizedcritic/mm
unity-tech-cn:/develop/centralizedcritic/nonego
unity-tech-cn:/develop/zombieteammanager/disableagent
unity-tech-cn:/develop/zombieteammanager/killfirst
unity-tech-cn:/develop/superpush/int
unity-tech-cn:/develop/superpush/branch-cleanup
unity-tech-cn:/develop/teammanager/int
unity-tech-cn:/develop/teammanager/cubewar-nocycle
unity-tech-cn:/develop/teammanager/cubewars
unity-tech-cn:/develop/superpush/int/hunter
unity-tech-cn:/goal-conditioning/new/allo-crawler
unity-tech-cn:/develop/coma2/clip
unity-tech-cn:/develop/coma2/singlenetwork
unity-tech-cn:/develop/coma2/samenet
unity-tech-cn:/develop/coma2/fixgroup
unity-tech-cn:/develop/coma2/samenet/sum
unity-tech-cn:/hh-develop-dodgeball/goy-input
unity-tech-cn:/develop/soccer-groupman/mod
unity-tech-cn:/develop/soccer-groupman/mod/hunter
unity-tech-cn:/develop/soccer-groupman/mod/hunter/cine
unity-tech-cn:/ai-hw-2021/tensor-applier
拉取从: unity-tech-cn:exp-mede
unity-tech-cn:/main
unity-tech-cn:/develop-generalizationTraining-TrainerController
unity-tech-cn:/tag-0.2.0
unity-tech-cn:/tag-0.2.1
unity-tech-cn:/tag-0.2.1a
unity-tech-cn:/tag-0.2.1c
unity-tech-cn:/tag-0.2.1d
unity-tech-cn:/hotfix-v0.9.2a
unity-tech-cn:/develop-gpu-test
unity-tech-cn:/0.10.1
unity-tech-cn:/develop-pyinstaller
unity-tech-cn:/develop-horovod
unity-tech-cn:/PhysXArticulations20201
unity-tech-cn:/importdocfix
unity-tech-cn:/develop-resizetexture
unity-tech-cn:/hh-develop-walljump_bugfixes
unity-tech-cn:/develop-walljump-fix-sac
unity-tech-cn:/hh-develop-walljump_rnd
unity-tech-cn:/tag-0.11.0.dev0
unity-tech-cn:/develop-pytorch
unity-tech-cn:/tag-0.11.0.dev2
unity-tech-cn:/develop-newnormalization
unity-tech-cn:/tag-0.11.0.dev3
unity-tech-cn:/develop
unity-tech-cn:/release-0.12.0
unity-tech-cn:/tag-0.12.0-dev
unity-tech-cn:/tag-0.12.0.dev0
unity-tech-cn:/tag-0.12.1
unity-tech-cn:/2D-explorations
unity-tech-cn:/asymm-envs
unity-tech-cn:/tag-0.12.1.dev0
unity-tech-cn:/2D-exploration-raycast
unity-tech-cn:/tag-0.12.1.dev1
unity-tech-cn:/release-0.13.0
unity-tech-cn:/release-0.13.1
unity-tech-cn:/plugin-proof-of-concept
unity-tech-cn:/release-0.14.0
unity-tech-cn:/hotfix-bump-version-master
unity-tech-cn:/soccer-fives
unity-tech-cn:/release-0.14.1
unity-tech-cn:/bug-failed-api-check
unity-tech-cn:/test-recurrent-gail
unity-tech-cn:/hh-add-icons
unity-tech-cn:/release-0.15.0
unity-tech-cn:/release-0.15.1
unity-tech-cn:/hh-develop-all-posed-characters
unity-tech-cn:/internal-policy-ghost
unity-tech-cn:/distributed-training
unity-tech-cn:/hh-develop-improve_tennis
unity-tech-cn:/test-tf-ver
unity-tech-cn:/release_1_branch
unity-tech-cn:/tennis-time-horizon
unity-tech-cn:/whitepaper-experiments
unity-tech-cn:/r2v-yamato-linux
unity-tech-cn:/docs-update
unity-tech-cn:/release_2_branch
unity-tech-cn:/exp-mede
unity-tech-cn:/sensitivity
unity-tech-cn:/release_2_verified_load_fix
unity-tech-cn:/test-sampler
unity-tech-cn:/release_2_verified
unity-tech-cn:/hh-develop-ragdoll-testing
unity-tech-cn:/origin-develop-taggedobservations
unity-tech-cn:/MLA-1734-demo-provider
unity-tech-cn:/sampler-refactor-copy
unity-tech-cn:/PhysXArticulations20201Package
unity-tech-cn:/tag-com.unity.ml-agents_1.0.8
unity-tech-cn:/release_3_branch
unity-tech-cn:/github-actions
unity-tech-cn:/release_3_distributed
unity-tech-cn:/fix-batch-tennis
unity-tech-cn:/distributed-ppo-sac
unity-tech-cn:/gridworld-custom-obs
unity-tech-cn:/hw20-segmentation
unity-tech-cn:/hh-develop-gamedev-demo
unity-tech-cn:/active-variablespeed
unity-tech-cn:/release_4_branch
unity-tech-cn:/fix-env-step-loop
unity-tech-cn:/release_5_branch
unity-tech-cn:/fix-walker
unity-tech-cn:/release_6_branch
unity-tech-cn:/hh-32-observation-crawler
unity-tech-cn:/trainer-plugin
unity-tech-cn:/hh-develop-max-steps-demo-recorder
unity-tech-cn:/hh-develop-loco-walker-variable-speed
unity-tech-cn:/exp-0002
unity-tech-cn:/experiment-less-max-step
unity-tech-cn:/hh-develop-hallway-wall-mesh-fix
unity-tech-cn:/release_7_branch
unity-tech-cn:/exp-vince
unity-tech-cn:/hh-develop-gridsensor-tests
unity-tech-cn:/tag-release_8_test0
unity-tech-cn:/tag-release_8_test1
unity-tech-cn:/release_8_branch
unity-tech-cn:/docfix-end-episode
unity-tech-cn:/release_9_branch
unity-tech-cn:/hybrid-action-rewardsignals
unity-tech-cn:/MLA-462-yamato-win
unity-tech-cn:/exp-alternate-atten
unity-tech-cn:/hh-develop-fps_game_project
unity-tech-cn:/fix-conflict-base-env
unity-tech-cn:/release_10_branch
unity-tech-cn:/exp-bullet-hell-trainer
unity-tech-cn:/ai-summit-exp
unity-tech-cn:/comms-grad
unity-tech-cn:/walljump-pushblock
unity-tech-cn:/goal-conditioning
unity-tech-cn:/release_11_branch
unity-tech-cn:/hh-develop-water-balloon-fight
unity-tech-cn:/gc-hyper
unity-tech-cn:/layernorm
unity-tech-cn:/yamato-linux-debug-venv
unity-tech-cn:/soccer-comms
unity-tech-cn:/hh-develop-pushblockcollab
unity-tech-cn:/release_12_branch
unity-tech-cn:/fix-get-step-sp-curr
unity-tech-cn:/continuous-comms
unity-tech-cn:/no-comms
unity-tech-cn:/hh-develop-zombiepushblock
unity-tech-cn:/hypernetwork
unity-tech-cn:/revert-4859-develop-update-readme
unity-tech-cn:/sequencer-env-attention
unity-tech-cn:/hh-develop-variableobs
unity-tech-cn:/exp-tanh
unity-tech-cn:/reward-dist
unity-tech-cn:/exp-weight-decay
unity-tech-cn:/exp-robot
unity-tech-cn:/bullet-hell-barracuda-test-1.3.1
unity-tech-cn:/release_13_branch
unity-tech-cn:/release_14_branch
unity-tech-cn:/exp-clipped-gaussian-entropy
unity-tech-cn:/tic-tac-toe
unity-tech-cn:/hh-develop-dodgeball
unity-tech-cn:/repro-vis-obs-perf
unity-tech-cn:/v2-staging-rebase
unity-tech-cn:/release_15_branch
unity-tech-cn:/release_15_removeendepisode
unity-tech-cn:/release_16_branch
unity-tech-cn:/release_16_fix_gridsensor
unity-tech-cn:/ai-hw-2021
unity-tech-cn:/check-for-ModelOverriders
unity-tech-cn:/fix-grid-obs-shape-init
unity-tech-cn:/fix-gym-needs-reset
unity-tech-cn:/fix-resume-imi
unity-tech-cn:/release_17_branch
unity-tech-cn:/release_17_branch_gpu_test
unity-tech-cn:/colab-links
unity-tech-cn:/exp-continuous-div
unity-tech-cn:/release_17_branch_gpu_2
unity-tech-cn:/exp-diverse-behavior
unity-tech-cn:/grid-onehot-extra-dim-empty
unity-tech-cn:/2.0-verified
unity-tech-cn:/faster-entropy-coeficient-convergence
unity-tech-cn:/pre-r18-update-changelog
unity-tech-cn:/release_18_branch
unity-tech-cn:/main/tracking
unity-tech-cn:/main/reward-providers
unity-tech-cn:/main/project-upgrade
unity-tech-cn:/main/limitation-docs
unity-tech-cn:/develop/nomaxstep-test
unity-tech-cn:/develop/tf2.0
unity-tech-cn:/develop/tanhsquash
unity-tech-cn:/develop/magic-string
unity-tech-cn:/develop/trainerinterface
unity-tech-cn:/develop/separatevalue
unity-tech-cn:/develop/nopreviousactions
unity-tech-cn:/develop/reenablerepeatactions
unity-tech-cn:/develop/0memories
unity-tech-cn:/develop/fixmemoryleak
unity-tech-cn:/develop/reducewalljump
unity-tech-cn:/develop/removeactionholder-onehot
unity-tech-cn:/develop/canonicalize-quaternions
unity-tech-cn:/develop/self-playassym
unity-tech-cn:/develop/demo-load-seek
unity-tech-cn:/develop/progress-bar
unity-tech-cn:/develop/sac-apex
unity-tech-cn:/develop/cubewars
unity-tech-cn:/develop/add-fire
unity-tech-cn:/develop/gym-wrapper
unity-tech-cn:/develop/mm-docs-main-readme
unity-tech-cn:/develop/mm-docs-overview
unity-tech-cn:/develop/no-threading
unity-tech-cn:/develop/dockerfile
unity-tech-cn:/develop/model-store
unity-tech-cn:/develop/checkout-conversion-rebase
unity-tech-cn:/develop/model-transfer
unity-tech-cn:/develop/bisim-review
unity-tech-cn:/develop/taggedobservations
unity-tech-cn:/develop/transfer-bisim
unity-tech-cn:/develop/bisim-sac-transfer
unity-tech-cn:/develop/basketball
unity-tech-cn:/develop/torchmodules
unity-tech-cn:/develop/fixmarkdown
unity-tech-cn:/develop/shortenstrikervsgoalie
unity-tech-cn:/develop/shortengoalie
unity-tech-cn:/develop/torch-save-rp
unity-tech-cn:/develop/torch-to-np
unity-tech-cn:/develop/torch-omp-no-thread
unity-tech-cn:/develop/actionmodel-csharp
unity-tech-cn:/develop/torch-extra
unity-tech-cn:/develop/restructure-torch-networks
unity-tech-cn:/develop/jit
unity-tech-cn:/develop/adjust-cpu-settings-experiment
unity-tech-cn:/develop/torch-sac-threading
unity-tech-cn:/develop/wb
unity-tech-cn:/develop/amrl
unity-tech-cn:/develop/memorydump
unity-tech-cn:/develop/permutepytorch
unity-tech-cn:/develop/sac-targetq
unity-tech-cn:/develop/actions-out
unity-tech-cn:/develop/reshapeonnxmemories
unity-tech-cn:/develop/crawlergail
unity-tech-cn:/develop/debugtorchfood
unity-tech-cn:/develop/hybrid-actions
unity-tech-cn:/develop/bullet-hell
unity-tech-cn:/develop/action-spec-gym
unity-tech-cn:/develop/battlefoodcollector
unity-tech-cn:/develop/use-action-buffers
unity-tech-cn:/develop/hardswish
unity-tech-cn:/develop/leakyrelu
unity-tech-cn:/develop/torch-clip-scale
unity-tech-cn:/develop/contentropy
unity-tech-cn:/develop/manch
unity-tech-cn:/develop/torchcrawlerdebug
unity-tech-cn:/develop/fix-nan
unity-tech-cn:/develop/multitype-buffer
unity-tech-cn:/develop/windows-delay
unity-tech-cn:/develop/torch-tanh
unity-tech-cn:/develop/gail-norm
unity-tech-cn:/develop/multiprocess
unity-tech-cn:/develop/unified-obs
unity-tech-cn:/develop/rm-rf-new-models
unity-tech-cn:/develop/skipcritic
unity-tech-cn:/develop/centralizedcritic
unity-tech-cn:/develop/dodgeball-tests
unity-tech-cn:/develop/cc-teammanager
unity-tech-cn:/develop/weight-decay
unity-tech-cn:/develop/singular-embeddings
unity-tech-cn:/develop/zombieteammanager
unity-tech-cn:/develop/superpush
unity-tech-cn:/develop/teammanager
unity-tech-cn:/develop/zombie-exp
unity-tech-cn:/develop/update-readme
unity-tech-cn:/develop/readme-fix
unity-tech-cn:/develop/coma-noact
unity-tech-cn:/develop/coma-withq
unity-tech-cn:/develop/coma2
unity-tech-cn:/develop/action-slice
unity-tech-cn:/develop/gru
unity-tech-cn:/develop/critic-op-lstm-currentmem
unity-tech-cn:/develop/decaygail
unity-tech-cn:/develop/gail-srl-hack
unity-tech-cn:/develop/rear-pad
unity-tech-cn:/develop/mm-copyright-dates
unity-tech-cn:/develop/dodgeball-raycasts
unity-tech-cn:/develop/collab-envs-exp-ervin
unity-tech-cn:/develop/pushcollabonly
unity-tech-cn:/develop/sample-curation
unity-tech-cn:/develop/soccer-groupman
unity-tech-cn:/develop/input-actuator-tanks
unity-tech-cn:/develop/validate-release-fix
unity-tech-cn:/develop/new-console-log
unity-tech-cn:/develop/lex-walker-model
unity-tech-cn:/develop/lstm-burnin
unity-tech-cn:/develop/grid-vaiable-names
unity-tech-cn:/develop/fix-attn-embedding
unity-tech-cn:/develop/api-documentation-update-some-fixes
unity-tech-cn:/develop/update-grpc
unity-tech-cn:/develop/grid-rootref-debug
unity-tech-cn:/develop/pbcollab-rays
unity-tech-cn:/develop/2.0-verified-pre
unity-tech-cn:/develop/parameterizedenvs
unity-tech-cn:/develop/custom-ray-sensor
unity-tech-cn:/develop/mm-add-v2blog
unity-tech-cn:/develop/custom-raycast
unity-tech-cn:/develop/area-manager
unity-tech-cn:/develop/remove-unecessary-lr
unity-tech-cn:/develop/use-base-env-in-learn
unity-tech-cn:/soccer-fives/multiagent
unity-tech-cn:/develop/cubewars/splashdamage
unity-tech-cn:/develop/add-fire/exp
unity-tech-cn:/develop/add-fire/jit
unity-tech-cn:/develop/add-fire/speedtest
unity-tech-cn:/develop/add-fire/bc
unity-tech-cn:/develop/add-fire/ckpt-2
unity-tech-cn:/develop/add-fire/normalize-context
unity-tech-cn:/develop/add-fire/components-dir
unity-tech-cn:/develop/add-fire/halfentropy
unity-tech-cn:/develop/add-fire/memoryclass
unity-tech-cn:/develop/add-fire/categoricaldist
unity-tech-cn:/develop/add-fire/mm
unity-tech-cn:/develop/add-fire/sac-lst
unity-tech-cn:/develop/add-fire/mm3
unity-tech-cn:/develop/add-fire/continuous
unity-tech-cn:/develop/add-fire/ghost
unity-tech-cn:/develop/add-fire/policy-tests
unity-tech-cn:/develop/add-fire/export-discrete
unity-tech-cn:/develop/add-fire/test-simple-rl-fix-resnet
unity-tech-cn:/develop/add-fire/remove-currdoc
unity-tech-cn:/develop/add-fire/clean2
unity-tech-cn:/develop/add-fire/doc-cleanups
unity-tech-cn:/develop/add-fire/changelog
unity-tech-cn:/develop/add-fire/mm2
unity-tech-cn:/develop/model-transfer/add-physics
unity-tech-cn:/develop/model-transfer/train
unity-tech-cn:/develop/jit/experiments
unity-tech-cn:/exp-vince/sep30-2020
unity-tech-cn:/hh-develop-gridsensor-tests/static
unity-tech-cn:/develop/hybrid-actions/distlist
unity-tech-cn:/develop/bullet-hell/buffer
unity-tech-cn:/goal-conditioning/new
unity-tech-cn:/goal-conditioning/sensors-2
unity-tech-cn:/goal-conditioning/sensors-3-pytest-fix
unity-tech-cn:/goal-conditioning/grid-world
unity-tech-cn:/soccer-comms/disc
unity-tech-cn:/develop/centralizedcritic/counterfact
unity-tech-cn:/develop/centralizedcritic/mm
unity-tech-cn:/develop/centralizedcritic/nonego
unity-tech-cn:/develop/zombieteammanager/disableagent
unity-tech-cn:/develop/zombieteammanager/killfirst
unity-tech-cn:/develop/superpush/int
unity-tech-cn:/develop/superpush/branch-cleanup
unity-tech-cn:/develop/teammanager/int
unity-tech-cn:/develop/teammanager/cubewar-nocycle
unity-tech-cn:/develop/teammanager/cubewars
unity-tech-cn:/develop/superpush/int/hunter
unity-tech-cn:/goal-conditioning/new/allo-crawler
unity-tech-cn:/develop/coma2/clip
unity-tech-cn:/develop/coma2/singlenetwork
unity-tech-cn:/develop/coma2/samenet
unity-tech-cn:/develop/coma2/fixgroup
unity-tech-cn:/develop/coma2/samenet/sum
unity-tech-cn:/hh-develop-dodgeball/goy-input
unity-tech-cn:/develop/soccer-groupman/mod
unity-tech-cn:/develop/soccer-groupman/mod/hunter
unity-tech-cn:/develop/soccer-groupman/mod/hunter/cine
unity-tech-cn:/ai-hw-2021/tensor-applier
2 次代码提交
作者 | SHA1 | 备注 | 提交日期 |
---|---|---|---|
Andrew Cohen | 88153b61 | add mede opt with format | 5 年前 |
Andrew Cohen | 704d0d11 | add mede optimizer | 5 年前 |
共有 1 个文件被更改,包括 697 次插入 和 0 次删除
|
|||
import numpy as np |
|||
from typing import Dict, List, Optional, Any, Mapping |
|||
|
|||
from mlagents.tf_utils import tf |
|||
|
|||
from mlagents_envs.logging_util import get_logger |
|||
from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork |
|||
from mlagents.trainers.models import ScheduleType, EncoderType, ModelUtils |
|||
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer |
|||
from mlagents.trainers.policy.tf_policy import TFPolicy |
|||
from mlagents.trainers.buffer import AgentBuffer |
|||
from mlagents_envs.timers import timed |
|||
|
|||
EPSILON = 1e-6  # Small value to avoid divide by zero

# Module-level logger, named after this module.
logger = get_logger(__name__)

# NOTE(review): presumably the TF variable-scope names for the policy and
# target networks; neither constant is referenced in the visible part of this
# file, so confirm against the rest of the module.
POLICY_SCOPE = ""
TARGET_SCOPE = "target_network"
|||
|
|||
|
|||
class MEDEOptimizer(TFOptimizer): |
|||
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]) -> None:
    """
    Build the training graph of the MEDE optimizer on top of ``policy``.

    Creates the policy's TF graph and then, inside that graph, the SAC policy
    and target networks, a discriminator over the observation part of the
    vector input (plus a second one fed by ``policy.output`` when the policy
    uses continuous actions), the learning-rate schedule, the losses and the
    optimizer ops. Finally initializes or loads the policy and registers the
    tensors/ops exposed through ``update_dict``.

    :param policy: TFPolicy whose graph, inputs and outputs this optimizer
        builds on.
    :param trainer_params: Trainer hyper-parameters. ``learning_rate``,
        ``hidden_units``, ``max_steps``, ``num_layers`` and ``reward_signals``
        (every entry must provide ``gamma``) are read by index and therefore
        required. Optional keys and their fallbacks:
        ``learning_rate_schedule`` ("constant"), ``vis_encode_type``
        ("simple"), ``tau`` (0.005), ``burn_in_ratio`` (0.0), ``mede`` (10)
        and ``init_entcoef`` (1.0).
    """
    # Create the graph here to give more granular control of the TF graph to the Optimizer.
    policy.create_tf_graph()

    with policy.graph.as_default():
        with tf.variable_scope(""):
            super().__init__(policy, trainer_params)
            lr = float(trainer_params["learning_rate"])
            lr_schedule = ScheduleType(
                trainer_params.get("learning_rate_schedule", "constant")
            )
            self.policy = policy
            self.act_size = self.policy.act_size
            h_size = int(trainer_params["hidden_units"])
            max_step = float(trainer_params["max_steps"])
            num_layers = int(trainer_params["num_layers"])
            vis_encode_type = EncoderType(
                trainer_params.get("vis_encode_type", "simple")
            )
            self.tau = trainer_params.get("tau", 0.005)
            self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))
            # Width of the trailing slice that _split() cuts off the vector
            # input, and the size argument passed to the discriminators below.
            self.num_diverse = int(trainer_params.get("mede", 10))

            # Non-exposed SAC parameters
            self.discrete_target_entropy_scale = (
                0.2
            )  # Roughly equal to e-greedy 0.05
            self.continuous_target_entropy_scale = 1.0

            self.init_entcoef = trainer_params.get("init_entcoef", 1.0)
            # NOTE(review): self.reward_signals is not assigned in this method;
            # presumably set by super().__init__ above - confirm.
            stream_names = list(self.reward_signals.keys())
            # Use to reduce "survivor bonus" when using Curiosity or GAIL.
            self.gammas = [
                _val["gamma"] for _val in trainer_params["reward_signals"].values()
            ]
            # One variable per reward stream, initialised to 1.0, together
            # with an assign op that sets it to 0.0.
            self.use_dones_in_backup = {
                name: tf.Variable(1.0) for name in stream_names
            }
            self.disable_use_dones = {
                name: self.use_dones_in_backup[name].assign(0.0)
                for name in stream_names
            }

            # Clamp to at least one hidden layer.
            if num_layers < 1:
                num_layers = 1

            # Initialised empty / None here. NOTE(review): presumably filled
            # in by _create_sac_optimizer_ops() further down - confirm.
            self.target_init_op: List[tf.Tensor] = []
            self.target_update_op: List[tf.Tensor] = []
            self.update_batch_disc: Optional[tf.Operation] = None
            self.update_batch_policy: Optional[tf.Operation] = None
            self.update_batch_value: Optional[tf.Operation] = None
            self.update_batch_entropy: Optional[tf.Operation] = None

            self.policy_network = SACPolicyNetwork(
                policy=self.policy,
                # NOTE(review): the earlier comment here read
                # "3x policy.m_size", but policy.m_size is passed unchanged.
                m_size=self.policy.m_size,
                h_size=h_size,
                normalize=self.policy.normalize,
                use_recurrent=self.policy.use_recurrent,
                num_layers=num_layers,
                stream_names=stream_names,
                vis_encode_type=vis_encode_type,
            )
            self.target_network = SACTargetNetwork(
                policy=self.policy,
                m_size=self.policy.m_size,  # 1x policy.m_size
                h_size=h_size,
                normalize=self.policy.normalize,
                use_recurrent=self.policy.use_recurrent,
                num_layers=num_layers,
                stream_names=stream_names,
                vis_encode_type=vis_encode_type,
            )
            # Separate the vector input into the observation part and the
            # trailing num_diverse columns (kept as self._z_one_hot).
            obs, self._z_one_hot = self._split(self.policy.vector_in)
            # Discriminator fed with the policy network's external action input.
            self.disc = ModelUtils.create_discriminator(
                obs,
                self.num_diverse,
                action_input=self.policy_network.external_action_in,
            )
            self.discp = None

            # Continuous actions only: a second discriminator fed with the
            # policy's own output instead of the external action input.
            if self.policy.use_continuous_act:
                self.discp = ModelUtils.create_discriminator(
                    obs, self.num_diverse, action_input=self.policy.output
                )

            # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
            self.m_size = 3 * self.policy.m_size
            self._create_inputs_and_outputs()
            self.learning_rate = ModelUtils.create_schedule(
                lr_schedule,
                lr,
                self.policy.global_step,
                int(max_step),
                min_value=1e-10,
            )
            self._create_losses(
                self.policy_network.q1_heads,
                self.policy_network.q2_heads,
                lr,
                int(max_step),
                stream_names,
                discrete=not self.policy.use_continuous_act,
            )
            self._create_sac_optimizer_ops()

            self.selected_actions = (
                self.policy.selected_actions
            )  # For GAIL and other reward signals
            if self.policy.normalize:
                target_update_norm = self.target_network.copy_normalization(
                    self.policy.running_mean,
                    self.policy.running_variance,
                    self.policy.normalization_steps,
                )
                # Update the normalization of the optimizer when the policy does.
                self.policy.update_normalization_op = tf.group(
                    [self.policy.update_normalization_op, target_update_norm]
                )

            self.policy.initialize_or_load()

    # Reported stat name -> key in self.update_dict.
    self.stats_name_to_update_name = {
        "Losses/Value Loss": "value_loss",
        "Losses/Policy Loss": "policy_loss",
        "Losses/Q1 Loss": "q1_loss",
        "Losses/Q2 Loss": "q2_loss",
        "Losses/Discriminator Loss": "disc_loss",
        "Policy/Entropy Coeff": "entropy_coef",
        "Policy/Learning Rate": "learning_rate",
        "Policy/Discriminability": "discriminability",
    }

    # Key -> tensor/op on this optimizer. NOTE(review): the loss tensors,
    # discriminability and ent_coef are not assigned in this method;
    # presumably created by _create_losses() - confirm.
    self.update_dict = {
        "value_loss": self.total_value_loss,
        "policy_loss": self.policy_loss,
        "disc_loss": self.disc_loss,
        "discriminability": self.discriminability,
        "q1_loss": self.q1_loss,
        "q2_loss": self.q2_loss,
        "entropy_coef": self.ent_coef,
        "update_batch": self.update_batch_policy,
        "update_value": self.update_batch_value,
        "update_entropy": self.update_batch_entropy,
        "update_disc": self.update_batch_disc,
        "learning_rate": self.learning_rate,
    }
|||
|
|||
def _split(self, observation_and_skill: tf.Tensor) -> List[tf.Tensor]:
    """
    Split a tensor into two pieces along axis 1.
    The first piece has width `self.policy.vec_obs_size - self.num_diverse` and the
    second piece has width `self.num_diverse`.
    :param observation_and_skill: Tensor to split. Presumably a vector observation
        with a skill encoding appended (inferred from the name; confirm with callers).
    :return: The two tensors produced by tf.split, in the order described above.
    """
    skill_width = self.num_diverse
    observation_width = self.policy.vec_obs_size - skill_width
    return tf.split(observation_and_skill, [observation_width, skill_width], axis=1)
|||
|
|||
def _create_inputs_and_outputs(self) -> None:
    """
    Expose the inputs and outputs of the policy, the policy network and the target
    network as attributes of this object, and create the `dones_holder` placeholder.
    """
    policy = self.policy
    policy_network = self.policy_network
    target_network = self.target_network

    # The plain inputs alias the policy; the `next_*` inputs alias the target network.
    self.vector_in = policy.vector_in
    self.visual_in = policy.visual_in
    self.next_vector_in = target_network.vector_in
    self.next_visual_in = target_network.visual_in
    self.sequence_length_ph = policy.sequence_length_ph
    self.next_sequence_length_ph = target_network.sequence_length_ph

    if policy.use_continuous_act:
        self.output_pre = policy_network.output_pre
    else:
        self.action_masks = policy_network.action_masks

    # Don't use value estimate during inference.
    self.value = tf.identity(policy_network.value, name="value_estimate_unused")
    self.value_heads = policy_network.value_heads
    self.dones_holder = tf.placeholder(
        shape=[None], dtype=tf.float32, name="dones_holder"
    )

    # Everything below only applies to recurrent policies.
    if not policy.use_recurrent:
        return

    self.memory_in = policy_network.memory_in
    self.memory_out = policy_network.memory_out
    if not policy.use_continuous_act:
        self.prev_action = policy_network.prev_action
    self.next_memory_in = target_network.memory_in
|||
|
|||
def _create_losses(
    self,
    q1_streams: Dict[str, tf.Tensor],
    q2_streams: Dict[str, tf.Tensor],
    lr: tf.Tensor,
    max_step: int,
    stream_names: List[str],
    discrete: bool = False,
) -> None:
    """
    Creates training-specific Tensorflow ops for SAC models.
    Sets self.target_entropy, self.rewards_holders, self.min_policy_qs,
    self.disc_loss, self.discriminability, self.q1_loss, self.q2_loss,
    self.log_ent_coef, self.ent_coef, self.entropy_loss, self.policy_loss,
    self.value_loss, self.total_value_loss and self.entropy.
    :param q1_streams: Q1 streams from policy network
    :param q2_streams: Q2 streams from policy network
    :param lr: Learning rate. Not read by this method.
    :param max_step: Total number of training steps. Not read by this method.
    :param stream_names: List of reward stream names.
    :param discrete: Whether or not to use discrete action losses.
    """

    if discrete:
        self.target_entropy = [
            self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
            for i in self.act_size
        ]
        # These two tensors only exist on the discrete path; every later use of
        # them is guarded by `if discrete`.
        discrete_action_probs = tf.exp(self.policy.all_log_probs)
        # Holds p * log(p) per action, i.e. the entropy terms before negation.
        per_action_entropy = discrete_action_probs * self.policy.all_log_probs
    else:
        self.target_entropy = (
            -1
            * self.continuous_target_entropy_scale
            * np.prod(self.act_size[0]).astype(np.float32)
        )

    self.rewards_holders = {}
    self.min_policy_qs = {}

    # discriminator loss
    self.disc_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=self._z_one_hot, logits=self.disc
        )
    )
    # Only assigned a tensor for continuous actions, and only read by the
    # continuous policy loss below.
    discriminabilityp = None

    if self.policy.use_continuous_act:
        discriminabilityp = -1 * tf.nn.softmax_cross_entropy_with_logits(
            labels=self._z_one_hot, logits=self.discp
        )
    # Negated, un-averaged cross entropy of the same logits used for disc_loss.
    self.discriminability = -1 * tf.nn.softmax_cross_entropy_with_logits(
        labels=self._z_one_hot, logits=self.disc
    )

    for name in stream_names:
        if discrete:
            _branched_mpq1 = ModelUtils.break_into_branches(
                self.policy_network.q1_pheads[name] * discrete_action_probs,
                self.act_size,
            )
            branched_mpq1 = tf.stack(
                [
                    tf.reduce_sum(_br, axis=1, keep_dims=True)
                    for _br in _branched_mpq1
                ]
            )
            _q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)

            _branched_mpq2 = ModelUtils.break_into_branches(
                self.policy_network.q2_pheads[name] * discrete_action_probs,
                self.act_size,
            )
            branched_mpq2 = tf.stack(
                [
                    tf.reduce_sum(_br, axis=1, keep_dims=True)
                    for _br in _branched_mpq2
                ]
            )
            _q2_p_mean = tf.reduce_mean(branched_mpq2, axis=0)

            self.min_policy_qs[name] = tf.minimum(_q1_p_mean, _q2_p_mean)
        else:
            self.min_policy_qs[name] = tf.minimum(
                self.policy_network.q1_pheads[name],
                self.policy_network.q2_pheads[name],
            )

        rewards_holder = tf.placeholder(
            shape=[None], dtype=tf.float32, name="{}_rewards".format(name)
        )
        self.rewards_holders[name] = rewards_holder

    q1_losses = []
    q2_losses = []
    # Multiple q losses per stream
    expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
    for i, name in enumerate(stream_names):
        _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)

        # Target for both Q functions; no gradient flows into the target network.
        q_backup = tf.stop_gradient(
            _expanded_rewards
            + (1.0 - self.use_dones_in_backup[name] * expanded_dones)
            * self.gammas[i]
            * self.target_network.value_heads[name]
        )

        if discrete:
            # We need to break up the Q functions by branch, and update them individually.
            branched_q1_stream = ModelUtils.break_into_branches(
                self.policy.selected_actions * q1_streams[name], self.act_size
            )
            branched_q2_stream = ModelUtils.break_into_branches(
                self.policy.selected_actions * q2_streams[name], self.act_size
            )

            # Reduce each branch into scalar
            branched_q1_stream = [
                tf.reduce_sum(_branch, axis=1, keep_dims=True)
                for _branch in branched_q1_stream
            ]
            branched_q2_stream = [
                tf.reduce_sum(_branch, axis=1, keep_dims=True)
                for _branch in branched_q2_stream
            ]

            q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
            q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)

        else:
            q1_stream = q1_streams[name]
            q2_stream = q2_streams[name]

        _q1_loss = 0.5 * tf.reduce_mean(
            tf.to_float(self.policy.mask)
            * tf.squared_difference(q_backup, q1_stream)
        )

        _q2_loss = 0.5 * tf.reduce_mean(
            tf.to_float(self.policy.mask)
            * tf.squared_difference(q_backup, q2_stream)
        )

        q1_losses.append(_q1_loss)
        q2_losses.append(_q2_loss)

    self.q1_loss = tf.reduce_mean(q1_losses)
    self.q2_loss = tf.reduce_mean(q2_losses)

    # Learn entropy coefficient
    if discrete:
        # Create a log_ent_coef for each branch
        self.log_ent_coef = tf.get_variable(
            "log_ent_coef",
            dtype=tf.float32,
            initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
                np.float32
            ),
            trainable=True,
        )
    else:
        self.log_ent_coef = tf.get_variable(
            "log_ent_coef",
            dtype=tf.float32,
            initializer=np.log(self.init_entcoef).astype(np.float32),
            trainable=True,
        )

    self.ent_coef = tf.exp(self.log_ent_coef)
    if discrete:
        # We also have to do a different entropy and target_entropy per branch.
        branched_per_action_ent = ModelUtils.break_into_branches(
            per_action_entropy, self.act_size
        )
        branched_ent_sums = tf.stack(
            [
                tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
                for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
            ],
            axis=1,
        )
        # stop_gradient: this loss only trains log_ent_coef.
        self.entropy_loss = -tf.reduce_mean(
            tf.to_float(self.policy.mask)
            * tf.reduce_mean(
                self.log_ent_coef
                * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
                axis=1,
            )
        )

        # Same with policy loss, we have to do the loss per branch and average them,
        # so that larger branches don't get more weight.
        # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
        branched_q_term = ModelUtils.break_into_branches(
            discrete_action_probs * self.policy_network.q1_p, self.act_size
        )

        branched_policy_loss = tf.stack(
            [
                tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
                for i, (_lp, _qt) in enumerate(
                    zip(branched_per_action_ent, branched_q_term)
                )
            ]
        )
        self.policy_loss = tf.reduce_mean(
            tf.to_float(self.policy.mask) * tf.squeeze(branched_policy_loss)
            - self.discriminability
        )

        # Do vbackup entropy bonus per branch as well.
        branched_ent_bonus = tf.stack(
            [
                tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
                for i, _lp in enumerate(branched_per_action_ent)
            ]
        )
        value_losses = []
        for name in stream_names:
            v_backup = tf.stop_gradient(
                self.min_policy_qs[name]
                + self.discriminability
                - tf.reduce_mean(branched_ent_bonus, axis=0)
            )
            value_losses.append(
                0.5
                * tf.reduce_mean(
                    tf.to_float(self.policy.mask)
                    * tf.squared_difference(
                        self.policy_network.value_heads[name], v_backup
                    )
                )
            )

    else:
        # stop_gradient: this loss only trains log_ent_coef.
        self.entropy_loss = -tf.reduce_mean(
            self.log_ent_coef
            * tf.to_float(self.policy.mask)
            * tf.stop_gradient(
                tf.reduce_sum(
                    self.policy.all_log_probs + self.target_entropy,
                    axis=1,
                    keep_dims=True,
                )
            )
        )
        batch_policy_loss = tf.reduce_mean(
            self.ent_coef * self.policy.all_log_probs - self.policy_network.q1_p,
            axis=1,
        )
        self.policy_loss = tf.reduce_mean(
            tf.to_float(self.policy.mask) * batch_policy_loss - discriminabilityp
        )

        value_losses = []
        for name in stream_names:
            v_backup = tf.stop_gradient(
                self.min_policy_qs[name]
                + self.discriminability
                - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
            )
            value_losses.append(
                0.5
                * tf.reduce_mean(
                    tf.to_float(self.policy.mask)
                    * tf.squared_difference(
                        self.policy_network.value_heads[name], v_backup
                    )
                )
            )
    self.value_loss = tf.reduce_mean(value_losses)

    self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss

    self.entropy = self.policy_network.entropy
|||
|
|||
def _create_sac_optimizer_ops(self) -> None:
    """
    Build the optimizers and update ops for SAC: the policy, value, entropy
    coefficient and discriminator updates, plus the ops that initialize and
    softly update the target network's value variables.
    """
    learning_rate = self.learning_rate
    policy_optimizer = self.create_optimizer_op(
        learning_rate=learning_rate, name="sac_policy_opt"
    )
    entropy_optimizer = self.create_optimizer_op(
        learning_rate=learning_rate, name="sac_entropy_opt"
    )
    value_optimizer = self.create_optimizer_op(
        learning_rate=learning_rate, name="sac_value_opt"
    )
    discriminator_optimizer = self.create_optimizer_op(
        learning_rate=learning_rate, name="mede_disc_opt"
    )

    policy_value_vars = self.policy_network.value_vars
    target_value_vars = self.target_network.value_vars

    # Soft update: move each target variable a fraction tau towards its source.
    tau = self.tau
    self.target_update_op = [
        tf.assign(target_var, (1 - tau) * target_var + tau * source_var)
        for target_var, source_var in zip(target_value_vars, policy_value_vars)
    ]

    # Dump every variable group to the debug log, each under its own heading.
    policy_vars = self.policy.get_trainable_variables()
    for heading, group in (
        ("value_vars", policy_value_vars),
        ("targvalue_vars", target_value_vars),
        ("critic_vars", self.policy_network.critic_vars),
        ("q_vars", self.policy_network.q_vars),
        ("policy_vars", policy_vars),
    ):
        logger.debug(heading)
        self.print_all_vars(group)

    # Hard copy of the policy network's value variables into the target network.
    self.target_init_op = [
        tf.assign(target_var, source_var)
        for target_var, source_var in zip(target_value_vars, policy_value_vars)
    ]

    discriminator_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope="discriminator"
    )
    self.update_batch_disc = discriminator_optimizer.minimize(
        self.disc_loss, var_list=discriminator_vars
    )

    self.update_batch_policy = policy_optimizer.minimize(
        self.policy_loss, var_list=policy_vars
    )

    # Make sure policy is updated first, then value, then entropy.
    with tf.control_dependencies([self.update_batch_policy]):
        self.update_batch_value = value_optimizer.minimize(
            self.total_value_loss, var_list=self.policy_network.critic_vars
        )
        # Add entropy coefficient optimization operation
        with tf.control_dependencies([self.update_batch_value]):
            self.update_batch_entropy = entropy_optimizer.minimize(
                self.entropy_loss, var_list=self.log_ent_coef
            )
|||
|
|||
def print_all_vars(self, variables):
    """
    Write every element of `variables` to the module logger at debug level,
    one log record per element.
    :param variables: Iterable of objects to log.
    """
    for entry in variables:
        logger.debug(entry)
|||
|
|||
@timed
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Updates model using buffer.
    Evaluates self.update_dict through self._execute_model, then runs
    self.target_update_op in the session.
    :param batch: Experience mini-batch.
    :param num_sequences: Number of trajectories in batch.
    :return: Output from update process, keyed by the stat names in
        self.stats_name_to_update_name.
    """
    feed_dict = self._construct_feed_dict(self.policy, batch, num_sequences)
    update_vals = self._execute_model(feed_dict, self.update_dict)
    # Keep only the outputs that are reported as stats, under their stat names.
    update_stats: Dict[str, float] = {
        stat_name: update_vals[update_name]
        for stat_name, update_name in self.stats_name_to_update_name.items()
    }
    # Update target network. By default, target update happens at every policy update.
    self.sess.run(self.target_update_op)
    return update_stats
|||
|
|||
def update_reward_signals(
    self, reward_signal_minibatches: Mapping[str, AgentBuffer], num_sequences: int
) -> Dict[str, float]:
    """
    Only update the reward signals.
    :param reward_signal_minibatches: Minibatches to use for updating the reward
        signals, indexed by name. If empty or None, nothing is added to the feed and
        update dicts; self._execute_model is still called with the empty dicts.
    :param num_sequences: Passed through unchanged to add_reward_signal_dicts.
    :return: Values produced by the update, keyed by stat name.
    """
    # Collect feed dicts for all reward signals.
    feed_dict: Dict[tf.Tensor, Any] = {}
    update_dict: Dict[str, tf.Tensor] = {}
    stats_needed: Dict[str, str] = {}
    if reward_signal_minibatches:
        self.add_reward_signal_dicts(
            feed_dict,
            update_dict,
            stats_needed,
            reward_signal_minibatches,
            num_sequences,
        )
    update_vals = self._execute_model(feed_dict, update_dict)
    update_stats: Dict[str, float] = {
        stat_name: update_vals[update_name]
        for stat_name, update_name in stats_needed.items()
    }
    return update_stats
|||
|
|||
def add_reward_signal_dicts(
    self,
    feed_dict: Dict[tf.Tensor, Any],
    update_dict: Dict[str, tf.Tensor],
    stats_needed: Dict[str, str],
    reward_signal_minibatches: Mapping[str, AgentBuffer],
    num_sequences: int,
) -> None:
    """
    Adds the items needed for reward signal updates to the feed_dict, update_dict
    and stats_needed dicts. All three dicts are modified in place.
    :param feed_dict: Feed dict that needs update
    :param update_dict: Update dict that needs update
    :param stats_needed: Stats needed to get from the update.
    :param reward_signal_minibatches: Minibatches to use for updating the reward signals,
        indexed by name.
    :param num_sequences: Passed through to each reward signal's prepare_update.
    """
    for name, r_batch in reward_signal_minibatches.items():
        # Look the reward signal up once; a name missing from self.reward_signals
        # fails here, before any of the three dicts is touched for this entry.
        reward_signal = self.reward_signals[name]
        feed_dict.update(
            reward_signal.prepare_update(self.policy, r_batch, num_sequences)
        )
        update_dict.update(reward_signal.update_dict)
        stats_needed.update(reward_signal.stats_name_to_update_name)
|||
|
|||
def _construct_feed_dict(
    self, policy: TFPolicy, batch: AgentBuffer, num_sequences: int
) -> Dict[tf.Tensor, Any]:
    """
    Builds the feed dict for updating the SAC model.
    :param policy: The policy whose placeholders are fed. Note that the settings
        (sequence length, action type, observation types, recurrence) are always
        read from self.policy, not from this argument.
    :param batch: Mini-batch to use to update.
    :param num_sequences: Number of LSTM sequences in batch.
    :return: Mapping from placeholder to the value to feed for it.
    """
    # Do an optional burn-in for memories
    num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
    burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
    # Zero the first num_burn_in steps of a sequence, then repeat per sequence.
    burn_in_mask[range(0, num_burn_in)] = 0
    burn_in_mask = np.tile(burn_in_mask, num_sequences)
    feed_dict = {
        policy.batch_size_ph: num_sequences,
        policy.sequence_length_ph: self.policy.sequence_length,
        self.next_sequence_length_ph: self.policy.sequence_length,
        self.policy.mask_input: batch["masks"] * burn_in_mask,
    }
    for name in self.reward_signals:
        feed_dict[self.rewards_holders[name]] = batch["{}_rewards".format(name)]

    if self.policy.use_continuous_act:
        feed_dict[self.policy_network.external_action_in] = batch["actions"]
    else:
        # for discriminator
        feed_dict[self.policy_network.external_action_in] = batch["actions"]
        feed_dict[policy.output] = batch["actions"]
        if self.policy.use_recurrent:
            feed_dict[policy.prev_action] = batch["prev_action"]
        feed_dict[policy.action_masks] = batch["action_mask"]
    if self.policy.use_vec_obs:
        feed_dict[policy.vector_in] = batch["vector_obs"]
        feed_dict[self.next_vector_in] = batch["next_vector_in"]
    if self.policy.vis_obs_size > 0:
        for i, visual_ph in enumerate(policy.visual_in):
            feed_dict[visual_ph] = batch["visual_obs%d" % i]
        for i, next_visual_ph in enumerate(self.next_visual_in):
            feed_dict[next_visual_ph] = batch["next_visual_obs%d" % i]
    if self.policy.use_recurrent:
        # Take the stored memory at the first step of every sequence.
        feed_dict[policy.memory_in] = [
            batch["memory"][i]
            for i in range(0, len(batch["memory"]), self.policy.sequence_length)
        ]
        # The network memories are fed from self._make_zero_mem; the target
        # network gets a third of the memory size used for the policy network.
        feed_dict[self.policy_network.memory_in] = self._make_zero_mem(
            self.m_size, batch.num_experiences
        )
        feed_dict[self.target_network.memory_in] = self._make_zero_mem(
            self.m_size // 3, batch.num_experiences
        )
    feed_dict[self.dones_holder] = batch["done"]
    return feed_dict
撰写
预览
正在加载...
取消
保存
Reference in new issue