|
|
|
|
|
|
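"""
Training integration test.

Builds a standalone 3DBall player, runs a very short mlagents-learn training session
against it, checks that the expected model files were produced, and (when testing the
latest python trainer and C# package) re-runs the player in inference mode with the
trained model.
"""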
import argparse
import os
import shutil
import subprocess
import sys
import time
from typing import Any

# Shared yamato CI helpers (module name assumed to match the other yamato scripts).
from .yamato_utils import (
    find_executables,
    get_base_path,
    get_base_output_path,
    run_standalone_build,
    override_config_file,
    override_legacy_config_file,
    undo_git_checkout,
)

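
# Returns True if the training run (and the optional inference check) succeeded.
# A python_version / csharp_version of None means "use the latest code".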
def run_training(python_version: str, csharp_version: str) -> bool:
    latest = "latest"
    run_id = int(time.time() * 1000.0)
    print(
        f"Running training with python={python_version or latest} "
        f"and c#={csharp_version or latest}"
    )
    # Older trainer versions wrote models to ./models, current ones to ./results (assumed).
    output_dir = "models" if python_version else "results"
    nn_file_expected = f"./{output_dir}/{run_id}/3DBall.nn"
    onnx_file_expected = f"./{output_dir}/{run_id}/3DBall.onnx"
    frozen_graph_file_expected = f"./{output_dir}/{run_id}/3DBall/frozen_graph_def.pb"

    # Nothing should be left over from a previous run.
    if os.path.exists(nn_file_expected) or os.path.exists(onnx_file_expected):
        print("Artifacts from a previous run found, aborting.")
        return False

    base_path = get_base_path()
    print(f"Running in base path {base_path}")

    build_returncode = run_standalone_build(base_path)

    if build_returncode != 0:
        print(f"Standalone build FAILED! with return code {build_returncode}")
        return False

    # Now rename the newly-built executable, and restore the old one.
    # NOTE: the player names below are assumptions and must match the output of
    # run_standalone_build above.
    standalone_player_path = f"testPlayer_{csharp_version or latest}"
    full_player_path = os.path.join(get_base_output_path(), "testPlayer.app")
    final_player_path = os.path.join(
        get_base_output_path(), standalone_player_path + ".app"
    )
    os.rename(full_player_path, final_player_path)

    # Copy the default training config but override the max_steps parameter,
    # and reduce the batch_size and buffer_size enough to ensure an update step happens.
    yaml_out = "override.yaml"
    if python_version:
        overrides: Any = {"max_steps": 100, "batch_size": 10, "buffer_size": 10}
        override_legacy_config_file(
            python_version, "config/trainer_config.yaml", yaml_out, **overrides
        )
    else:
        # The same overrides, expressed in the current trainer config format.
        overrides = {
            "max_steps": 100,
            "hyperparameters": {"batch_size": 10, "buffer_size": 10},
        }
        override_config_file("config/ppo/3DBall.yaml", yaml_out, overrides)

    env_path = os.path.join(get_base_output_path(), standalone_player_path + ".app")
    mla_learn_cmd = (
        f"mlagents-learn {yaml_out} --force --env={env_path} "
        f"--run-id={run_id} --no-graphics --env-args -logFile -"
    )  # noqa

    res = subprocess.run(mla_learn_cmd, shell=True)

    # Save models as artifacts (only if we're using latest python and C#)
    if csharp_version is None and python_version is None:
        model_artifacts_dir = os.path.join(get_base_output_path(), "models")
        os.makedirs(model_artifacts_dir, exist_ok=True)
        shutil.copy(nn_file_expected, model_artifacts_dir)
        shutil.copy(onnx_file_expected, model_artifacts_dir)
        shutil.copy(frozen_graph_file_expected, model_artifacts_dir)

    if (
        res.returncode != 0
        or not os.path.exists(nn_file_expected)
        or not os.path.exists(onnx_file_expected)
    ):
        return False

    if csharp_version is None and python_version is None:
        # Use abs path so that loading doesn't get confused
        model_path = os.path.abspath(os.path.dirname(nn_file_expected))
        # Onnx loading for overrides not currently supported, but this is
        # where to add it in when it is.
        for extension in ["nn"]:
            inference_ok = run_inference(env_path, model_path, extension)
            if not inference_ok:
                return False

    return True

|
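# Launches the standalone player in batch mode, pointing it at the trained model via the
# --mlagents-override-* command-line options, and quitting after a single episode.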
def run_inference(env_path: str, output_path: str, model_extension: str) -> bool:
    start_time = time.time()
    exes = find_executables(env_path)
    if len(exes) != 1:
        print(f"Can't determine the player executable in {env_path}. Found {exes}.")
        return False

    log_output_path = f"{get_base_output_path()}/inference.{model_extension}.txt"

    exe_path = exes[0]
    args = [
        exe_path,
        "-nographics",
        "-batchmode",
        "-logfile",
        log_output_path,
        "--mlagents-override-model-directory",
        output_path,
        "--mlagents-quit-on-load-failure",
        "--mlagents-quit-after-episodes",
        "1",
        "--mlagents-override-model-extension",
        model_extension,
    ]
    res = subprocess.run(args)
    end_time = time.time()
    if res.returncode != 0:
        print("Error running inference!")
        print("Command line: " + " ".join(args))
        subprocess.run(["cat", log_output_path])
        return False
    else:
        print(f"Inference succeeded! Took {end_time - start_time} seconds")

    return True

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--python", default=None)
    parser.add_argument("--csharp", default=None)
    args = parser.parse_args()

    try:
        ok = run_training(args.python, args.csharp)
        if not ok:
            sys.exit(1)
    finally:
        # Cleanup - this gets executed even if we hit sys.exit()
        undo_git_checkout()
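
if __name__ == "__main__":
    # --python / --csharp select an older trainer / C# package version to test against;
    # both default to the latest code when omitted.
    main()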
|
|
|