
Upgraded to Barracuda 0.2.1, fixes issues with discrete action models

/develop-generalizationTraining-TrainerController
Mantas Puida, 5 years ago
Commit 8a5d17da
7 changed files with 656 additions and 603 deletions
  1. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll (825 changed lines)
  2. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/BarracudaReferenceImpl.compute (18 changed lines)
  3. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute (142 changed lines)
  4. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md (25 changed lines)
  5. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json (2 changed lines)
  6. ml-agents/mlagents/trainers/barracuda.py (13 changed lines)
  7. ml-agents/mlagents/trainers/tensorflow_to_barracuda.py (234 changed lines)

UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll (825 changed lines)
File diff suppressed because it is too large to display.

UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/BarracudaReferenceImpl.compute (18 changed lines)


uint4 _Pool;
uint4 _Stride;
float _Alpha;
float _Beta;
float _Seed;
[numthreads(8,8,1)]

if (X.channels - c == 1)
{
// broadcast to all channels
- v = X.Get(b, y, x, c);
+ v = _Alpha * X.Get(b, y, x, c) + _Beta;
- v.r = X.Get(b, y, x, c+0);
- v.g = X.Get(b, y, x, c+1);
- v.b = X.Get(b, y, x, c+2);
+ v.r = _Alpha * X.Get(b, y, x, c+0) + _Beta;
+ v.g = _Alpha * X.Get(b, y, x, c+1) + _Beta;
+ v.b = _Alpha * X.Get(b, y, x, c+2) + _Beta;
- v.r = X.Get(b, y, x, c+0);
- v.g = X.Get(b, y, x, c+1);
- v.b = X.Get(b, y, x, c+2);
- v.a = X.Get(b, y, x, c+3);
+ v.r = _Alpha * X.Get(b, y, x, c+0) + _Beta;
+ v.g = _Alpha * X.Get(b, y, x, c+1) + _Beta;
+ v.b = _Alpha * X.Get(b, y, x, c+2) + _Beta;
+ v.a = _Alpha * X.Get(b, y, x, c+3) + _Beta;
Otex2D[dispatchThreadID.xy] = v;
}
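For orientation, a hedged NumPy sketch of the per-pixel packing that this tensor-to-texture path appears to perform: channels starting at c are written into one RGBA pixel, each transformed by _Alpha * v + _Beta, and a single remaining channel is broadcast to all components. The function and its defaults are hypothetical illustrations, not code from this commit.

import numpy as np

def tensor_to_texture_pixel(X, b, y, x, c, alpha=1.0, beta=0.0):
    # Hypothetical NumPy stand-in, not the shader: pack tensor channels starting
    # at c into one RGBA pixel, applying the affine transform alpha * v + beta.
    remaining = X.shape[3] - c
    if remaining == 1:
        v = alpha * X[b, y, x, c] + beta
        return np.full(4, v, dtype=np.float32)   # single channel broadcast to RGBA
    vals = [alpha * X[b, y, x, c + i] + beta for i in range(min(remaining, 4))]
    vals += [0.0] * (4 - len(vals))              # assumption: unused components left at 0
    return np.asarray(vals, dtype=np.float32)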

UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute (142 changed lines)


#pragma kernel Conv2D
#pragma kernel Conv2D_RegisterBlock4x2
#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4
#pragma kernel Conv2D_L1Cached32_RegisterBlock4x4
#pragma kernel DepthwiseConv2D

#undef SIZE_W
#undef SIZE_H
#undef L1CACHESIZE
#define L1CACHESIZE 64
#undef SIZE
#define SIZE 4
groupshared float Conv2D_L1Cached64_Reg_Loop_safe_X[SIZE*SIZE][L1CACHESIZE];
[numthreads(L1CACHESIZE, 1, 1)]
void Conv2D_L1Cached64_RegisterBlock4x4(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define CONV2D_L1CACHED(L1CACHESIZE, SIZE, FMA) \
groupshared float Conv2D_L1Cached##L1CACHESIZE##_Reg_Loop_safe_X[SIZE*SIZE][L1CACHESIZE];\
[numthreads(L1CACHESIZE, 1, 1)]\
void Conv2D_L1Cached##L1CACHESIZE##_RegisterBlock##SIZE##x##SIZE(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)\
{\
DISPATCH_ARGS(K.kernelCount, O.width, O.height);\
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);\
\
uint k = L1CACHESIZE * groupID.x + groupThreadID.x;\
uint x = groupID.y;\
uint y = groupID.z;\
\
if (x*SIZE >= O.width) return;\
if (y*SIZE >= O.height) return;\
\
for (uint n = 0; n < O.batch; ++n)\
{\
float acc[SIZE*SIZE];\
[unroll]\
for (uint q = 0; q < SIZE*SIZE; ++q)\
acc[q] = B.SafeGet(k);\
\
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)\
{\
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)\
{\
uint2 pos[SIZE*SIZE];\
[unroll]\
for (uint q = 0; q < SIZE*SIZE; ++q)\
pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy);\
\
for (uint c = 0; c < X.channels; c += L1CACHESIZE)\
{\
uint dc = groupThreadID.x;\
[unroll]\
for (q = 0; q < SIZE*SIZE; ++q)\
Conv2D_L1Cached##L1CACHESIZE##_Reg_Loop_safe_X[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy);\
GroupMemoryBarrierWithGroupSync();\
\
if (k < K.channels)\
{\
uint kIndex = K.Index(dy, dx, c, k);\
for (dc = 0; dc < L1CACHESIZE; ++dc)\
{\
[unroll]\
for (q = 0; q < SIZE*SIZE; ++q)\
acc[q] = FMA(Conv2D_L1Cached##L1CACHESIZE##_Reg_Loop_safe_X[q][dc], K.data[kIndex], acc[q]);\
kIndex += K.channels;\
}\
}\
GroupMemoryBarrierWithGroupSync();\
}\
}\
}\
\
uint remainderW = (O.width - x*SIZE);\
uint remainderH = (O.height - y*SIZE);\
\
if (k < K.channels)\
[unroll]\
for (q = 0; q < SIZE*SIZE; ++q)\
if (q/SIZE < remainderH && q%SIZE < remainderW)\
O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]);\
}\
\
}
#define X_ Conv2D_L1Cached64_Reg_Loop_safe_X
uint k = L1CACHESIZE * groupID.x + groupThreadID.x;
uint x = groupID.y;
uint y = groupID.z;
// need all threads to load channels, thus will do late check against kernel count
if (x*SIZE >= O.width) return;
if (y*SIZE >= O.height) return;
for (uint n = 0; n < O.batch; ++n)
{
float acc[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
acc[q] = B.SafeGet(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy);
for (uint c = 0; c < X.channels; c += L1CACHESIZE)
{
// Cache X
uint dc = groupThreadID.x;
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
X_[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy);
GroupMemoryBarrierWithGroupSync();
CONV2D_L1CACHED(64,4, fastfma)
CONV2D_L1CACHED(32,4, fastfma)
// X * K
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
{
uint kIndex = K.Index(dy, dx, c, k);
for (dc = 0; dc < L1CACHESIZE && (c + dc) < K.GetKernelDepth(); ++dc)
{
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
acc[q] = fastfma(X_[q][dc], K.data[kIndex], acc[q]);
kIndex += K.channels;
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
uint remainderW = (O.width - x*SIZE);
uint remainderH = (O.height - y*SIZE);
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
if (q/SIZE < remainderH && q%SIZE < remainderW)
O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]);
}
#undef X_
}
NUMTHREADS((16,4,4), (8,4,4), (4,4,4))
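For reference, a hedged NumPy sketch of the computation the Conv2D kernels above perform, independent of the L1 caching and register blocking (NHWC input layout, [kH, kW, C, K] kernel layout, and the padding convention are assumptions, not taken from the shader): every output pixel starts from the bias and accumulates the dot product of a strided, padded input patch with the kernel.

import numpy as np

def conv2d_reference(X, K, B, stride=(1, 1), pad=(0, 0)):
    # X: [N, H, W, C] input, K: [kH, kW, C, Kc] kernel, B: [Kc] bias (assumed layouts).
    N, H, W, C = X.shape
    kH, kW, _, Kc = K.shape
    Xp = np.pad(X, ((0, 0), (pad[1], pad[1]), (pad[0], pad[0]), (0, 0)))
    oH = (H + 2 * pad[1] - kH) // stride[1] + 1
    oW = (W + 2 * pad[0] - kW) // stride[0] + 1
    O = np.broadcast_to(B, (N, oH, oW, Kc)).astype(np.float32)  # acc[q] = B.SafeGet(k)
    for y in range(oH):
        for x in range(oW):
            # strided input patch, analogous to pos[q] = (x, y) * _Stride.xy + (dx, dy)
            patch = Xp[:, y * stride[1]:y * stride[1] + kH,
                          x * stride[0]:x * stride[0] + kW, :]
            O[:, y, x, :] += np.tensordot(patch, K, axes=([1, 2, 3], [0, 1, 2]))
    return O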

UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md (25 changed lines)


# Release notes
## 0.2.1
- TF importer: fixed ResizeNearestNeighbor aka Upsample2D scaling factor detection.
- TF importer: optimized node sorting. Should be faster than 0.2.0.
- TF importer: made detection of the actual output node from the LSTM/GRU pattern more robust by skipping Const nodes.
- TF importer: improved InstanceNormalization handling.
- TF importer: fixed SquareDifference pattern.
- TF importer: fixed Conv2DBackpropInput (transpose convolution) import.
- Fixed Conv2D performance regression on some GPUs.
- Fixed TextureAsTensorData.Download() to work properly with InterpretDepthAs.Channels.
- Fixed a bug where identity/nop layers would reuse an input as an output, later causing premature release of that tensor during intermediate data cleanup.
- Added scale + bias to TensorToRenderTexture interface, useful for adjusting network output scale + bias on the fly.
- Fixed double Dispose issue when worker gets garbage collected.
## 0.2.0
- Version bumped to 0.2.0 as it brings breaking API changes; see details below.
- Significantly reduced temporary memory allocations by introducing internal allocator support. Memory is now re-used between layer executions as much as possible.

- Compatibility with ML Agents models: 3DBall, PushBlock, GridWorld, Soccer.
## 0.1.0
- First internal build. Due to some bugs encountered, it wasn't published.
# Contributors
- Renaldas (ReJ) Zioma
- Mantas Puida
- Vladimir Oster
- Martin Sternevald
- Valdemar Bučilko
- Kuba Cupisz
- Povilas Kanapickas
- Paulius Puodžiūnas

UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json (2 changed lines)


{
"name": "com.unity.barracuda",
"displayName": "Barracuda",
"version": "0.2.0-preview",
"version": "0.2.1-preview",
"unity": "2017.4",
"description": "Barracuda is lightweight and cross-platform Neural Net inference library. Barracuda supports inference both on GPU and CPU.",
"dependencies": {}

ml-agents/mlagents/trainers/barracuda.py (13 changed lines)


assert(len(find_missing_inputs(new_model, inputs_and_memories)) == 0)
return new_model
# Trim
def trim(model, criteria_regexp_string, verbose):
if hasattr(model, 'layers'):

model = trim_model(model, preserve_outputs)
else:
print("WARNING: Trim couldn't find any layers to match:", criteria_regexp_string)
return model
# Fuse
def fuse(model, verbose):
i = 0
while i < len(model) - 1:
if model[i].type == model[i+1].type and model[i].type == 255: # Load
model[i].tensors += model[i+1].tensors
del model[i+1]
else:
i += 1
return model
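An illustrative sketch (not part of this commit) of the fuse() pass above, using a hypothetical stand-in for the layer objects: two consecutive constant Load layers (type 255) collapse into one layer that carries both tensors.

class FakeLayer:
    def __init__(self, layer_type, tensors):
        self.type = layer_type      # 255 == Load in the Barracuda model format
        self.tensors = tensors

model = [FakeLayer(255, ['kernel0']), FakeLayer(255, ['bias0']), FakeLayer(20, [])]
model = fuse(model, verbose=False)
assert len(model) == 2 and model[0].tensors == ['kernel0', 'bias0']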
def compress(model):

ml-agents/mlagents/trainers/tensorflow_to_barracuda.py (234 changed lines)


rank = 4,
out_shapes = lambda s: [
[s[0][0], s[0][1], s[0][3], s[0][2]], # K TF:[H, W, in_channels, channel_multiplier] => [H, W, 1, in_channels]
- [1, 1, 1, s[-1][-1]] if len(s) > 1 else
- [1, 1, 1, s[0][2]] # B
+ [1, 1, 1, s[-1][-1]] if len(s) > 1
+ else [1, 1, 1, s[0][2]] # B
],
patch_data = lambda data: [
np.transpose(data[0], (0,1,3,2)),

id = 22,
rank = 4,
out_shapes = lambda shapes: [
# NOTE: skip the 0th tensor
# in Conv2DBackpropInput 0th tensor is 'input_sizes' - which differs from other Conv layers
shapes[1], # K
[1, 1, 1, shapes[-1][-1]], # B
out_shapes = lambda s: [
[s[0][0], s[0][1], s[0][3], s[0][2]], # K TF:[H, W, in_channels, out_channels] => [H, W, out_channels, in_channels]
[1, 1, 1, s[-1][-1]] if len(s) > 1
else [1, 1, 1, s[0][2]] # B
data[1],
data[2] if len(data) > 2
else np.zeros([1,1,1,np.shape(data[1])[-1]]) # NOTE: since 0th tensor is skipped in out_shapes, bias tensor when missing is not automatically initialized with zeros
np.transpose(data[0], (0,1,3,2)),
data[1]
]),
'Pad': 29,

[data[0], data[1]] if len(data) == 4 else
[np.ones(np.shape(data[0])), data[0]]
),
- 'InstanceNormalization': Struct(
+ 'InstanceNormalization': Struct( # TODO: epsilon
id = 52,
out_shapes = lambda shapes: [
[1, 1, 1, shapes[0][0]], # G

repr(['Add', 'Rsqrt', 'Mul', 'Mul', 'Mul', 'Sub', 'Add']) : 'BatchNormalization',
repr(['Mean', 'StopGradient', 'SquaredDifference', 'Mean',
'Sub', 'Add', 'Pow', 'RealDiv', 'Mul', 'Add']) : 'InstanceNormalization_ByTensorOrder',
repr(['Mean', 'StopGradient', 'SquaredDifference', 'Mean',
'Add', 'Rsqrt', 'Mul', 'Mul', 'Mul', 'Sub', 'Add']) : 'InstanceNormalization',
'Add', 'Rsqrt', 'Mul', 'Mul', 'Mul', 'Sub', 'Add']) : 'InstanceNormalization_ByTensorName',
repr(['Conv2DBackpropInput']) : 'Conv2DBackpropInput',
repr(['Shape', 'StridedSlice', 'StridedSlice', 'StridedSlice', 'Mul',
'Mul', 'Pack', 'Conv2DBackpropInput', 'BiasAdd']) : 'Conv2DBackpropInput',
repr(['Shape', 'StridedSlice', 'StridedSlice', 'StridedSlice', 'Mul',
'Mul', 'Pack', 'Conv2DBackpropInput']) : 'Conv2DBackpropInput',
repr(['Shape', 'StridedSlice', 'Mul', 'ResizeNearestNeighbor'])
: 'ResizeNearestNeighbor',
repr(['Pack', 'Reshape']) : 'Flatten$', # for now we assume that this combination is trivial Flatten
# for example it is used in ML-agents LSTM nets with sequence_length==1

op = 'BatchNormalization',
input = [i for i in inputs] +
order_by([t.name for t in tensors], ['gamma', 'beta', 'mean', 'variance']),
),
'InstanceNormalization' : lambda nodes, inputs, tensors, _:
),
'InstanceNormalization_ByTensorName' : lambda nodes, inputs, tensors, _:
),
),
'InstanceNormalization_ByTensorOrder' : lambda nodes, inputs, tensors, _:
Struct(
op = 'InstanceNormalization',
input = [i for i in inputs] + [t.name for t in tensors][-2:],
),
'Dense' : lambda nodes, inputs, tensors, _:
Struct(
op = 'Dense',

'Conv2DBackpropInput' : lambda nodes, inputs, tensors, _:
Struct(
op = 'Conv2DBackpropInput',
- input = [i for i in inputs] + [t.name for t in tensors],
+ input = [i for i in inputs] + [t.name for t in tensors][1:][-2:], # [1:] - skips the 0th tensor, since Conv2DBackpropInput 0th tensor is 'input_sizes' (which differs from other Conv layers)
+ # [-2:] - take only the last 2 tensors; this allows processing large patterns with the same code
'ResizeNearestNeighbor' : lambda nodes, inputs, tensors, _:
Struct(
op = 'ResizeNearestNeighbor',
input = [i for i in inputs],
ksize = [int(tensors[0].data[0]), int(tensors[0].data[1])] if len(tensors) == 1 and len(tensors[0].data) == 2
else [int(tensors[-1].data[0]), int(tensors[-1].data[1])] if len(tensors) >= 4 and len(tensors[-1].data) == 2
else [1,1]
),
'Mean' : lambda nodes, inputs, tensors, _:
# take only the last input
barracuda.mean(nodes[-1].name, inputs[-1], axis=tensors[0].data),

'BasicLSTM' : lambda nodes, inputs, tensors, context:
- basic_lstm(nodes, inputs, tensors, context),
+ basic_lstm(nodes, inputs, tensors, context,
+ index_of_actual_output_node=-3, assert_output_node_op_type='Reshape'),
'Swish' : lambda nodes, inputs, tensors, _:
Struct(

def sqr_diff(name, a, b):
nn = barracuda.Build(name)
d = nn.sub(a, b)
- nn.mul(d, d)
+ nn.mul(d, d, out=name)
return nn.layers
def strided_slice(name, input, input_rank, begin, end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask):

nn.strided_slice(input, begin, end, strides, output_rank, out=name)
return nn.layers
def gru(nodes, inputs, tensors, context):
# search backwards starting from index_of_actual_output_node for non-const node
def locate_actual_output_node(nodes, index_of_actual_output_node=-1, assert_output_node_op_type=None):
while (-index_of_actual_output_node-1) < len(nodes) and nodes[index_of_actual_output_node].op == 'Const':
index_of_actual_output_node -= 1
actual_output_node = nodes[index_of_actual_output_node]
assert(actual_output_node.op == assert_output_node_op_type or not assert_output_node_op_type)
return actual_output_node
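An illustrative sketch (not part of this commit) of the helper above: starting from index -1 it walks backwards past trailing Const nodes until it reaches the real output node.

class FakeNode:
    def __init__(self, op):
        self.op = op

nodes = [FakeNode('Reshape'), FakeNode('Const'), FakeNode('Const')]
out = locate_actual_output_node(nodes, index_of_actual_output_node=-1,
                                assert_output_node_op_type='Reshape')
assert out.op == 'Reshape'   # the two trailing Const nodes were skipped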
def gru(nodes, inputs, tensors, context, index_of_actual_output_node, assert_output_node_op_type=None):
assert(len(inputs) == 2)
def find_tensor_by_name(name, default=None):

context.model_memories += [state_shape, state, new_state]
# map expected output of the replaced pattern to output from our GRU cell
- actual_output_node = nodes[-4]
- assert(actual_output_node.op == 'Reshape')
+ actual_output_node = locate_actual_output_node(nodes, index_of_actual_output_node, assert_output_node_op_type)
- def basic_lstm(nodes, inputs, tensors, context):
+ def basic_lstm(nodes, inputs, tensors, context, index_of_actual_output_node, assert_output_node_op_type=None):
assert(len(inputs) == 2)
def find_tensor_by_name(name, default=None):

context.model_memories += [state_shape, state_h, new_state_h]
# map expected output of the replaced pattern to output from our LSTM cell
- actual_output_node = nodes[-4]
- assert(actual_output_node.op == 'Reshape')
+ actual_output_node = locate_actual_output_node(nodes, index_of_actual_output_node, assert_output_node_op_type)
context.map_ignored_layer_to_its_input[actual_output_node.name] = new_state_h
return new_layers

# Find node patterns
nodes_as_array = [node for node in model.node]
- nodes_as_array = slow_but_stable_topological_sort(nodes_as_array)
+ nodes_as_array = slow_but_stable_topological_sort(nodes_as_array, verbose=True)
node_index = 0
while node_index < len(nodes_as_array):

process_layer(node, o_context, args)
node_index += 1
return o_context.layers, o_context.input_shapes, o_context.model_tensors, o_context.model_memories
def find_unconnected_const_nodes(nodes):
nodes_with_consts = {node.name: node for node in nodes if node.op == 'Const'}
for node in nodes:
for i in node.input:
nodes_with_consts.pop(i, None)
return list(nodes_with_consts.keys())
return o_context.layers, o_context.input_shapes, o_context.model_tensors, o_context.model_memories, \
find_unconnected_const_nodes(nodes_as_array)
- def slow_but_stable_topological_sort(nodes):
+ def slow_but_stable_topological_sort(nodes, verbose):
nodes_with_consts = [node for node in nodes if node.op == 'Const']
nodes_for_sorting = [node for node in nodes if node.op != 'Const']
# TODO: optimize for performance
# based on http://blog.gapotchenko.com/stable-topological-sort
def assign_ids(nodes):
ids = []
id_by_name = {}
id = 0
for node in nodes:
id_by_name[node.name] = id;
ids.append(id)
id += 1
inputs_by_id = [None] * len(nodes)
for node in nodes:
id = id_by_name[node.name]
inputs_by_id[id] = {id_by_name.get(i, -1) for i in node.input}
return ids, inputs_by_id
def sort(ids, inputs_by_id, verbose_lambda):
sorted = False
n = len(ids)
while not sorted:
sorted = True
for i in range(n):
for j in range (i):
if ids[i] in inputs_by_id[ids[j]]:
tmp = ids.pop(i)
ids.insert(j, tmp)
sorted = False
verbose_lambda(sorted)
return ids
prefix_printed = False
def print_status(sorted):
nonlocal prefix_printed
if not sorted:
if not prefix_printed:
print('Sorting model, may take a while...', end="", flush=True)
prefix_printed = True
else:
print('.', end="", flush=True)
else:
if prefix_printed:
print(' Done!')
ids, inputs_by_id = assign_ids(nodes_for_sorting)
ids = sort(ids, inputs_by_id, lambda sorted: print_status(sorted) if verbose else None)
assert(len(ids) == len(nodes_for_sorting))
assert(len(ids) + len(nodes_with_consts) == len(nodes))
return nodes_with_consts + [nodes_for_sorting[id] for id in ids]
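An illustrative sketch (not part of this commit) of the stable sort above, using a minimal hypothetical node type: node 'b' consumes 'a', so 'a' is bubbled in front of 'b', while Const nodes are simply kept at the front.

class FakeNode:
    def __init__(self, name, op, input):
        self.name = name
        self.op = op
        self.input = input

nodes = [FakeNode('b', 'Add',  ['a', 'w']),   # depends on 'a' and the Const 'w'
         FakeNode('a', 'Relu', []),
         FakeNode('w', 'Const', [])]
ordered = slow_but_stable_topological_sort(nodes, verbose=False)
assert [n.name for n in ordered] == ['w', 'a', 'b']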
def very_slow_but_stable_topological_sort(nodes, verbose):
# TODO: optimize for performance
# based on http://blog.gapotchenko.com/stable-topological-sort
n = len(nodes)

assert(len(nodes) == n)
return nodes
def sort_nodes_by_dependencies(nodes):
from collections import defaultdict
# Class to represent a graph
# Taken from: https://www.geeksforgeeks.org/python-program-for-topological-sorting/
class Graph:
def __init__(self,vertices):
self.graph = defaultdict(list) #dictionary containing adjacency List
self.V = vertices #No. of vertices
# function to add an edge to graph
def addEdge(self,u,v):
self.graph[u].append(v)
# A recursive function used by topologicalSort
def topologicalSortUtil(self,v,visited,stack):
# Mark the current node as visited.
visited[v] = True
# Recur for all the vertices adjacent to this vertex
for i in self.graph[v]:
if visited[i] == False:
self.topologicalSortUtil(i,visited,stack)
# Push current vertex to stack which stores result
stack.insert(0,v)
# The function to do Topological Sort. It uses recursive
# topologicalSortUtil()
def topologicalSort(self):
# Mark all the vertices as not visited
visited = [False]*self.V
stack =[]
# Call the recursive helper function to store Topological
# Sort starting from all vertices one by one
for i in range(self.V):
if visited[i] == False:
self.topologicalSortUtil(i,visited,stack)
return stack
g = Graph(len(nodes))
id_by_name = {}
id = 0
for node in nodes:
id_by_name[node.name] = id;
id += 1
for node in nodes:
for i in node.input:
#if i not in inputs_and_memories:
g.addEdge(id_by_name.get(i, -1), id_by_name[node.name])
sorted_layer_indices = g.topologicalSort()
return [nodes[idx] for idx in sorted_layer_indices]
#########################################################
def convert(source_file, target_file, trim_unused_by_output="", verbose=False, compress_f16=False):

# Convert
o_model = barracuda.Model()
- o_model.layers, o_input_shapes, o_model.tensors, o_model.memories = \
+ o_model.layers, o_input_shapes, o_model.tensors, o_model.memories, o_model.globals = \
process_model(i_model, args)
# Cleanup unconnected Identities (they might linger after processing complex node patterns like LSTM)

all_inputs = {i for l in o_model.layers for i in l.inputs}
embedded_tensors = {t.name for l in o_model.layers for t in l.tensors}
# Find global tensors
# Trim
if trim_unused_by_output:
o_model.layers = barracuda.trim(o_model.layers, trim_unused_by_output, args.verbose)
# Create load layer for constants
o_model.globals = [t for t in o_model.tensors if t not in all_inputs and t not in embedded_tensors]
#for x in global_tensors:
# shape = dims_to_barracuda_shape(get_tensor_dims(o_model.tensors[x]))
# o_globals += [Struct(
# name = x,
# shape = shape,
# data = np.reshape(get_tensor_data(o_model.tensors[x]), shape).astype(np.float32))]
# Trim
if trim_unused_by_output:
o_model.layers = barracuda.trim(o_model.layers, trim_unused_by_output, args.verbose)
# Create load layers for constants
o_l = Struct(
type = 255, # Load
class_name = "Const",

# Sort model so that layer inputs are always ready upfront
o_model.layers = barracuda.sort(o_model.layers, o_model.inputs, o_model.memories, args.verbose)
o_model.layers = barracuda.fuse(o_model.layers, args.verbose)
# Summary
barracuda.summary(o_model,
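For context, a hedged usage sketch of the convert() entry point shown above. The file paths and the output-node name are hypothetical, not taken from this commit; only the function signature comes from the code above.

# Convert a frozen TensorFlow graph into a Barracuda .nn model, trimming
# layers that the given output does not depend on.
from mlagents.trainers import tensorflow_to_barracuda as tf2bc

tf2bc.convert('models/my_brain/frozen_graph_def.pb',   # source TF graph (hypothetical path)
              'models/my_brain/my_brain.nn',           # target Barracuda model (hypothetical path)
              trim_unused_by_output='action',          # hypothetical output-node regexp
              verbose=True)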
