# pylint: skip-file
from __future__ import print_function
import numpy as np
import struct  # convert from Python values and C structs
import tensorflow as tf
import re

# import barracuda
# from barracuda import Struct
from mlagents.trainers import barracuda
from mlagents.trainers.barracuda import Struct
from google.protobuf import descriptor
from google.protobuf.json_format import MessageToJson


if __name__ == "__main__":
    # Handle command line argumengts
    args = barracuda.parse_args(
        description="Convert Tensorflow model to Barracuda binary",
        source_extension=".pb",
        help="input Tensorflow serialized .pb file",
    )
    # Te following code can be used as an example of API used from another module
    # convert() is the main entry point for converter
    import tensorflow_to_barracuda as tf2bc

    tf2bc.convert(args.source_file, args.target_file, args.trim_unused_by_output, args)


# TODO: support more than 1 LSTM layer per model - prepend scope to names and inputs
# TODO: support different activation functions in LSTM
# TODO: strip output Identity node, instead patch upstream layer names
# TODO: use ScaleBias and Pow with alpha when input is constant Tensor
# TODO: support all data format types (curretly only NHWC)
# TODO: support all data types (currently only FLOAT, INT32, BOOL)
# TODO: implement FusedResizeAndPadConv2D

# Important ProtoBuf definitions:
#    https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/types.proto
#    https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/tensor.proto
#    https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/node_def.proto
#
# Node descriptions:
#    https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/ops/nn_ops.cc
#    https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/ops/math_ops.cc
#    https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/ops/random_ops.cc
#
# Class doc:
#    https://www.tensorflow.org/api_docs/cc/
#
known_classes = {
    "Dense": Struct(
        id=1,
        rank=2,
        out_shapes=lambda shapes: [
            [shapes[0][0], 1, 1, shapes[0][1]]
            if len(shapes[0]) > 1
            else [1, 1, 1, 1],  # W
            [1, 1, 1, shapes[-1][-1]],  # B
        ],
        patch_data=lambda data: [data[0], data[1]],
    ),
    "MatMul": Struct(
        id=1,
        rank=2,
        out_shapes=lambda shapes: [
            [shapes[0][0], 1, 1, shapes[0][1]],  # W
            [1, 1, 1, shapes[0][1]],  # B
        ],
        patch_data=lambda data: [data[0], np.zeros(np.shape(data[1]))],
    ),
    "BiasAdd": Struct(
        id=51,  # implemented as ScaleBias
        out_shapes=lambda shapes: [
            [1, 1, 1, shapes[0][0]],  # ONE
            [1, 1, 1, shapes[0][0]],  # B
        ],
        patch_data=lambda data: [np.ones(np.shape(data[0])), data[0]],
    ),
    # TODO: NCHW
    "Conv2D": Struct(
        id=20,
        rank=4,
        out_shapes=lambda shapes: [shapes[0], [1, 1, 1, shapes[-1][-1]]],  # K  # B
        patch_data=lambda data: [data[0], data[1]],
    ),
    "DepthwiseConv2dNative": Struct(  # DepthwiseConv2D
        id=21,
        rank=4,
        out_shapes=lambda s: [
            [
                s[0][0],
                s[0][1],
                s[0][3],
                s[0][2],
            ],  # K TF:[H, W, in_channels, channel_multiplier] => [H, W, 1, in_channels]
            [1, 1, 1, s[-1][-1]] if len(s) > 1 else [1, 1, 1, s[0][2]],  # B
        ],
        patch_data=lambda data: [np.transpose(data[0], (0, 1, 3, 2)), data[1]],
    ),
    "Conv2DBackpropInput": Struct(  # Conv2DTranspose
        id=22,
        rank=4,
        out_shapes=lambda s: [
            [
                s[0][0],
                s[0][1],
                s[0][3],
                s[0][2],
            ],  # K TF:[H, W, in_channels, out_channels] => [H, W, out_channels, in_channels]
            [1, 1, 1, s[-1][-1]] if len(s) > 1 else [1, 1, 1, s[0][2]],  # B
        ],
        patch_data=lambda data: [np.transpose(data[0], (0, 1, 3, 2)), data[1]],
    ),
    "Pad": 29,
    # TODO: 3D
    "ResizeNearestNeighbor": 23,  # implemented as Upsample2D
    "ResizeBilinear": 23,  # implemented as Upsample2D
    "ResizeBicubic": 23,  # implemented as Upsample2D
    "MaxPool": 25,
    "AvgPool": 26,
    "GlobalAveragePool": 28,
    "GlobalAvgPool": 28,
    "Activation": 50,
    "BatchNormalization": Struct(
        id=51,  # after fusion implemented as ScaleBias
        out_shapes=lambda shapes: [
            [1, 1, 1, shapes[0][0]],  # S
            [1, 1, 1, shapes[0][0]],  # B
        ],
        patch_data=lambda data:
        # fuse [gamma, beta, mean, var, epsilon] => [scale, bias]
        # TODO: double-check if epsilon is the last data argument and not the 1st?
        barracuda.fuse_batchnorm_weights(data[0], data[1], data[2], data[3], data[4])
        if len(data) == 5
        else
        # fuse [ONE, beta, mean, var, epsilon] => [scale, bias]
        # TODO: double-check if epsilon is the last data argument and not the 1st?
        barracuda.fuse_batchnorm_weights(
            np.ones(np.shape(data[0])), data[0], data[1], data[2], data[3]
        ),
    ),
    "FusedBatchNorm": Struct(
        id=51,  # after fusion implemented as ScaleBias
        out_shapes=lambda shapes: [
            [1, 1, 1, shapes[0][0]],  # S
            [1, 1, 1, shapes[0][0]],  # B
        ],
        patch_data=lambda data, layer:
        # fuse [gamma, beta, mean, var, epsilon] => [scale, bias]
        barracuda.fuse_batchnorm_weights(
            data[0], data[1], data[2], data[3], get_epsilon(layer)
        ),
    ),
    "BatchNormalizationRuntime": Struct(
        id=52,
        out_shapes=lambda shapes: [
            [1, 1, 1, shapes[0][0]],  # G
            [1, 1, 1, shapes[0][0]],  # B
        ],
        patch_data=lambda data: [data[0], data[1]]
        if len(data) == 4
        else [np.ones(np.shape(data[0])), data[0]],
    ),
    "InstanceNormalization": Struct(  # TODO: epsilon
        id=52,
        out_shapes=lambda shapes: [
            [1, 1, 1, shapes[0][0]],  # G
            [1, 1, 1, shapes[0][0]],  # B
        ],
        patch_data=lambda data: [data[0], data[1]]
        if len(data) == 2
        else [np.ones(np.shape(data[0])), data[0]],
    ),
    "LRN": 53,
    "RandomStandardNormal": 64,
    "RandomUniform": 65,
    "Multinomial": Struct(id=66, rank=2),
    "OneHot": Struct(id=67, rank=lambda inputs: inputs[0] + 1),
    # Broadcast ops
    "Add": Struct(id=100, rank=lambda inputs: np.max(inputs)),
    "AddV2": Struct(id=100, rank=lambda inputs: np.max(inputs)),
    "Sub": Struct(id=101, rank=lambda inputs: np.max(inputs)),
    "Mul": Struct(id=102, rank=lambda inputs: np.max(inputs)),
    "RealDiv": Struct(id=103, rank=lambda inputs: np.max(inputs)),
    "Pow": Struct(id=104, rank=lambda inputs: np.max(inputs)),
    "Minimum": Struct(id=110, rank=lambda inputs: np.max(inputs)),
    "Maximum": Struct(id=111, rank=lambda inputs: np.max(inputs)),
    # Reduce ops
    "Max": Struct(id=124, rank=lambda inputs: inputs[0] - 1),
    "Mean": Struct(id=125, rank=lambda inputs: inputs[0] - 1),
    "Min": Struct(id=126, rank=lambda inputs: inputs[0] - 1),
    "Prod": Struct(id=127, rank=lambda inputs: inputs[0] - 1),
    "Sum": Struct(id=128, rank=lambda inputs: inputs[0] - 1),
    "Flatten": Struct(id=200, rank=2),
    "Reshape": 201,
    "Concat": 210,
    "StridedSlice": 211,
    "Nop": 0,
}

requires_runtime_flag = {
    "Dropout": "DropoutRuntime",
    "BatchNormalization": "BatchNormalizationRuntime",
}

known_activations = {
    "Linear": 0,
    "Relu": 1,
    "Softmax": 2,
    "Tanh": 3,
    "Sigmoid": 4,
    "Elu": 5,
    "Relu6": 6,
    "LeakyRelu": 7,
    "Selu": 8,
    "Swish": 9,
    "LogSoftmax": 10,
    "Softplus": 11,
    "Softsign": 12,
    "Abs": 100,
    "Neg": 101,
    "Ceil": 102,
    "Floor": 104,
    "Sqrt": 111,
    "Exp": 113,
    "Log": 114,
    "Acos": 200,
    "Acosh": 201,
    "Asin": 202,
    "Asinh": 203,
    "Atan": 204,
    "Atanh": 205,
    "Cos": 206,
    "Cosh": 207,
    "Sin": 208,
    "Sinh": 209,
    "Tan": 210,
}

known_paddings = {"VALID": [0, 0, 0, 0], "SAME": [-1]}  # SameUpper

supported_data_formats = {"NHWC"}

known_patterns = {
    # TODO: Flatten pattern using namespace regexp
    repr(["Shape", "StridedSlice", "Pack", "Reshape"]): "Flatten",
    repr(["Shape", "StridedSlice", "Prod", "Pack", "Reshape"]): "Flatten",
    repr(
        ["Shape", "Slice", "Slice", "Prod", "ExpandDims", "ConcatV2", "Reshape"]
    ): "Flatten",
    repr(["Add", "Rsqrt", "Mul", "Mul", "Sub", "Add"]): "BatchNormalization",
    repr(["Add", "Rsqrt", "Mul", "Mul", "Mul", "Sub", "Add"]): "BatchNormalization",
    repr(
        [
            "Mean",
            "StopGradient",
            "SquaredDifference",
            "Mean",
            "Sub",
            "Add",
            "Pow",
            "RealDiv",
            "Mul",
            "Add",
        ]
    ): "InstanceNormalization_ByTensorOrder",
    repr(
        [
            "Mean",
            "StopGradient",
            "SquaredDifference",
            "Mean",
            "Squeeze",
            "Squeeze",
            "Add",
            "Rsqrt",
            "Mul",
            "Mul",
            "Mul",
            "Sub",
            "Add",
        ]
    ): "InstanceNormalization_ByTensorName",
    repr(["MatMul", "BiasAdd"]): "Dense",
    repr(["Conv2D", "BiasAdd"]): "Conv2D",
    repr(["DepthwiseConv2dNative", "BiasAdd"]): "DepthwiseConv2dNative",
    repr(["Conv2DBackpropInput", "BiasAdd"]): "Conv2DBackpropInput",
    repr(["Conv2DBackpropInput"]): "Conv2DBackpropInput",
    repr(
        [
            "Shape",
            "StridedSlice",
            "StridedSlice",
            "StridedSlice",
            "Mul",
            "Mul",
            "Pack",
            "Conv2DBackpropInput",
            "BiasAdd",
        ]
    ): "Conv2DBackpropInput",
    repr(
        [
            "Shape",
            "StridedSlice",
            "StridedSlice",
            "StridedSlice",
            "Mul",
            "Mul",
            "Pack",
            "Conv2DBackpropInput",
        ]
    ): "Conv2DBackpropInput",
    repr(
        ["Shape", "StridedSlice", "Mul", "ResizeNearestNeighbor"]
    ): "ResizeNearestNeighbor",
    repr(
        ["Pack", "Reshape"]
    ): "Flatten$",  # for now we assume that this combination is trivial Flatten
    # for exmaple it is used in ML-agents LSTM nets with sequence_length==1
    repr(
        [
            "StridedSlice",
            "Reshape",
            re.compile("^lstm/"),
            "Reshape",
            "ConcatV2",
            "Identity",
        ]
    ): "BasicLSTMReshapeOut",
    repr(
        [re.compile("^lstm/"), "Reshape", "ConcatV2", "Identity"]
    ): "BasicLSTMReshapeOut",
    repr(
        ["Reshape", re.compile("^lstm_[a-z]*/"), "Reshape", "ConcatV2"]
    ): "BasicLSTMReshapeOut",
    repr(["Reshape", re.compile("^lstm_[a-z]*/"), "ConcatV2"]): "BasicLSTMConcatOut",
    repr(["Sigmoid", "Mul"]): "Swish",
    repr(["Mul", "Abs", "Mul", "Add"]): "LeakyRelu",
    repr(
        ["Shape", "Reshape"]
    ): "ReshapeLikeInput0",  # shape comes from the 1st node as input[0]
    repr(["Reshape"]): "Reshape",
    repr(["ConcatV2"]): "ConcatV2",
    repr(["Mean"]): "Mean",
    repr(["Pad"]): "Pad",
    repr(["Multinomial"]): "Multinomial",
    repr(["OneHot"]): "OneHot",
    repr(["Square"]): "Square",
    repr(["SquaredDifference"]): "SquaredDifference",
    repr(["StridedSlice"]): "StridedSlice",
    repr(["Squeeze"]): "Squeeze",
    repr(["ExpandDims"]): "ExpandDims",
    # TODO: FusedResizeAndPadConv2D
}


def by_name(args, name):
    for a in args:
        if a.name.endswith(name):
            return a


def by_op(args, op):
    for a in args:
        if a.op == op:
            return a


def order_by(args, names):
    ordered = []
    arg_count = len(args)
    for name in names:
        ordered += [a for a in args if a.endswith(name)]
        args = [a for a in args if not a.endswith(name)]
    ordered += args  # append what is left
    assert len(ordered) == arg_count
    return ordered


transform_patterns = {
    "Flatten": lambda nodes, inputs, tensors, _: Struct(op="Flatten", input=inputs),
    "Flatten$": lambda nodes, inputs, tensors, _: Struct(
        op="Flatten",
        input=[
            inputs[-1]
        ],  # take only the last input, assume all other arguments are trivial (like sequence_length==1
        # always in ML-agents LSTM nets)
    ),
    "Reshape": lambda nodes, inputs, tensors, context: Struct(
        op="Reshape",
        rank=len(tensors[0].data)
        if len(tensors)
        > 0  # tensor data is treated as reshape coefficient, if not empty
        else context.layer_ranks[inputs[1]]
        if len(inputs) == 2  # otherwise shape of the 2nd input tensor is used
        else -1,
        input=inputs,
        shape=[
            tensors[0].data[0],
            tensors[0].data[1],
            tensors[0].data[2],
            tensors[0].data[3],
        ]
        if len(tensors) > 0 and len(tensors[0].data) == 4
        else [tensors[0].data[0], 1, tensors[0].data[1], tensors[0].data[2]]
        if len(tensors) > 0 and len(tensors[0].data) == 3
        else [tensors[0].data[0], 1, 1, tensors[0].data[1]]
        if len(tensors) > 0 and len(tensors[0].data) == 2
        else [1, 1, 1, tensors[0].data[0]]
        if len(tensors) > 0 and len(tensors[0].data) == 1
        else [],
    ),
    "ReshapeLikeInput0": lambda nodes, inputs, tensors, context: Struct(
        op="Reshape",
        rank=context.layer_ranks[inputs[0]]
        if len(inputs)
        == 2  # unlike standard 'Reshape' input[0] is used as shape & input[1] as data
        else -1,
        input=[inputs[1], inputs[0]]
        if len(inputs)
        == 2  # unlike standard 'Reshape' input[0] is used as shape & input[1] as data
        else inputs,
    ),
    "Pad": lambda nodes, inputs, tensors, _: Struct(
        op="Pad"
        if (
            len(tensors) > 0
            and np.shape(tensors[0]) == [4, 2]
            and get_attr(nodes[-1], "mode", default="constant").lower() == "constant"
        )
        else "BarracudaUnsupportedPad",
        input=inputs,
        pads=[
            tensors[0].data[1, 0],
            tensors[0].data[1, 1],
            tensors[0].data[2, 0],
            tensors[0].data[2, 1],
        ]
        if len(tensors) > 0 and np.shape(tensors[0]) == [4, 2]
        else [0, 0, 0, 0],
        beta=get_attr(nodes[-1], "constant_values") or 0,
    ),
    "Squeeze": lambda nodes, inputs, tensors, context: Struct(
        op="Nop",  # Squeeze is no-operation in Barracuda
        input=inputs,
        rank=context.layer_ranks[inputs[0]] - len(get_attr(nodes[-1], "squeeze_dims"))
        if len(get_attr(nodes[-1], "squeeze_dims")) > 0
        else -1,  # if list of squeeze axis is not specified, it is unknown what would be the rank of result
    ),
    "ExpandDims": lambda nodes, inputs, tensors, context: Struct(
        op="Nop",  # ExpandDims is no-operation in Barracuda
        input=[inputs[0]],
        rank=context.layer_ranks[inputs[0]] + 1,
    ),
    "Multinomial": lambda nodes, inputs, tensors, _: Struct(
        op="Multinomial",
        input=inputs,
        shape=[int(by_name(tensors, "/num_samples").data[0])],
        # seed = get_attr(nodes[0], 'seed'),
    ),
    "OneHot": lambda nodes, inputs, tensors, _: Struct(
        op="OneHot",
        input=inputs,
        shape=[int(by_name(tensors, "/depth").data[0])],
        alpha=by_name(tensors, "/on_value").data[0],
        beta=by_name(tensors, "/off_value").data[0],
    ),
    "Square": lambda nodes, inputs, tensors, _: Struct(
        op="Mul", input=[inputs[0], inputs[0]]  # input * input
    ),
    "ConcatV2": lambda nodes, inputs, tensors, context: Struct(
        op="Concat",
        input=inputs,
        axis=axis_to_barracuda(
            int(by_name(tensors, "/axis").data[0]), context.layer_ranks[inputs[0]]
        ),
    ),
    "StridedSlice": lambda nodes, inputs, tensors, context: strided_slice(
        nodes[-1].name,
        inputs[0],
        context.layer_ranks[inputs[0]],
        begin=tensors[0].data,
        end=tensors[1].data,
        strides=tensors[2].data,
        begin_mask=get_attr(nodes[-1], "begin_mask"),
        end_mask=get_attr(nodes[-1], "end_mask"),
        ellipsis_mask=get_attr(nodes[-1], "ellipsis_mask"),
        new_axis_mask=get_attr(nodes[-1], "new_axis_mask"),
        shrink_axis_mask=get_attr(nodes[-1], "shrink_axis_mask"),
    ),
    "BatchNormalization": lambda nodes, inputs, tensors, _: Struct(
        op="BatchNormalization",
        input=[i for i in inputs]
        + order_by([t.name for t in tensors], ["gamma", "beta", "mean", "variance"]),
    ),
    "InstanceNormalization_ByTensorName": lambda nodes, inputs, tensors, _: Struct(
        op="InstanceNormalization",
        input=[i for i in inputs]
        + order_by([t.name for t in tensors], ["scale", "offset"]),
    ),
    "InstanceNormalization_ByTensorOrder": lambda nodes, inputs, tensors, _: Struct(
        op="InstanceNormalization",
        input=[i for i in inputs] + [t.name for t in tensors][-2:],
    ),
    "Dense": lambda nodes, inputs, tensors, _: Struct(
        op="Dense",
        input=[i for i in inputs] + [t.name for t in tensors],
        data_frmt=get_attr(
            by_op(nodes, "Dense") or by_op(nodes, "MatMul"), "data_format"
        ),
    ),
    "Conv2D": lambda nodes, inputs, tensors, _: Struct(
        op="Conv2D",
        input=[i for i in inputs] + [t.name for t in tensors],
        padding=get_attr(by_op(nodes, "Conv2D"), "padding"),
        strides=get_attr(by_op(nodes, "Conv2D"), "strides"),
        dilations=get_attr(by_op(nodes, "Conv2D"), "dilations"),
        data_frmt=get_attr(by_op(nodes, "Conv2D"), "data_format"),
    ),
    "DepthwiseConv2dNative": lambda nodes, inputs, tensors, _: Struct(
        op="DepthwiseConv2dNative",
        input=[i for i in inputs] + [t.name for t in tensors],
        padding=get_attr(by_op(nodes, "DepthwiseConv2dNative"), "padding"),
        strides=get_attr(by_op(nodes, "DepthwiseConv2dNative"), "strides"),
        dilations=get_attr(by_op(nodes, "DepthwiseConv2dNative"), "dilations"),
        data_frmt=get_attr(by_op(nodes, "DepthwiseConv2dNative"), "data_format"),
    ),
    "Conv2DBackpropInput": lambda nodes, inputs, tensors, _: Struct(
        op="Conv2DBackpropInput",
        input=[i for i in inputs]
        + [t.name for t in tensors][1:][
            -2:
        ],  # [1:]  - skips the 0th tensor, since Conv2DBackpropInput 0th tensor is 'input_sizes'
        # (which differs from other Conv layers)
        # [-2:] - take only last 2 tensors, this allows to process large patterns with the same code
        padding=get_attr(by_op(nodes, "Conv2DBackpropInput"), "padding"),
        strides=get_attr(by_op(nodes, "Conv2DBackpropInput"), "strides"),
        dilations=get_attr(by_op(nodes, "Conv2DBackpropInput"), "dilations"),
        data_frmt=get_attr(by_op(nodes, "Conv2DBackpropInput"), "data_format"),
    ),
    "ResizeNearestNeighbor": lambda nodes, inputs, tensors, _: Struct(
        op="ResizeNearestNeighbor",
        input=[i for i in inputs],
        ksize=[int(tensors[0].data[0]), int(tensors[0].data[1])]
        if len(tensors) == 1 and len(tensors[0].data) == 2
        else [int(tensors[-1].data[0]), int(tensors[-1].data[1])]
        if len(tensors) >= 4 and len(tensors[-1].data) == 2
        else [1, 1],
    ),
    "Mean": lambda nodes, inputs, tensors, _:
    # take only the last input
    barracuda.mean(nodes[-1].name, inputs[-1], axis=tensors[0].data),
    "SquaredDifference": lambda nodes, inputs, tensors, _: sqr_diff(
        nodes[-1].name, inputs[0], inputs[1]
    ),
    "BasicLSTMReshapeOut": lambda nodes, inputs, tensors, context: basic_lstm(
        nodes, inputs, tensors, context, find_type="Reshape"
    ),
    "BasicLSTMConcatOut": lambda nodes, inputs, tensors, context: basic_lstm(
        nodes, inputs, tensors, context, find_type="ConcatV2"
    ),
    "Swish": lambda nodes, inputs, tensors, _: Struct(op="Swish", input=inputs),
    "LeakyRelu": lambda nodes, inputs, tensors, _: Struct(op="LeakyRelu", input=inputs),
    # TODO:'Round'
    # TODO:'Rsqrt'
}


# Debug
def debug(s):
    print(s)
    return s


# Helper
def embody(v, default=0):
    return default if v is None else v


# Parse
def get_attr(node, attr_name, default=None):
    if type(node) == Struct:
        if hasattr(node, attr_name):
            return getattr(node, attr_name)
        else:
            return default

    # See: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/attr_value.proto
    val = node.attr[attr_name]

    if val.HasField("list"):
        return val.list.i
        # NOTE: can't find way to identify type of list BUT it is almost always list(int)
        # except list(float) in FractionalAvg/MaxPool
    if val.HasField("b"):
        return val.b
    if val.HasField("i"):
        return val.i
    if val.HasField("f"):
        return val.f
    if val.HasField("s"):
        return val.s.decode("utf-8")
    if val.HasField("shape"):
        return val.shape
    if val.HasField("tensor"):
        return val.tensor
    return default


def get_epsilon(layer):
    return get_attr(
        layer, "epsilon", default=0.001
    )  # default epsilon taken from tf.layers.batch_normalization


def get_layer_rank(layer):
    shape = get_attr(layer, "shape")
    if not shape:
        return None
    if isinstance(shape, list):
        return 1
    shape = [dim.size for dim in shape.dim]
    return len(shape)


def get_layer_shape(layer):
    shape = get_attr(layer, "shape")
    if not shape:
        return [-1, -1, -1, -1]
    shape = [dim.size for dim in shape.dim]
    if len(shape) == 1:
        return [1, 1, 1, shape[0]]
    if len(shape) == 2:
        return [shape[0], 1, 1, shape[1]]
    if len(shape) == 3:
        return [shape[0], 1, shape[1], shape[2]]
    return shape


def get_tensor_dims(tensor):
    if isinstance(tensor, np.ndarray):
        return np.shape(tensor)

    dims = []
    if tensor.tensor_shape:
        dims = [v.size for v in tensor.tensor_shape.dim]
    if tensor.float_val:
        dims = np.shape(tensor.float_val)
    if tensor.int_val:
        dims = np.shape(tensor.int_val)
    if tensor.bool_val:
        dims = np.shape(tensor.bool_val)
    return dims


def get_tensor_dtype(tensor):
    if isinstance(tensor, np.ndarray):
        return tensor.dtype

    dataType = ""
    fields = tensor.ListFields()

    for field, value in fields:
        if (
            field.name == "dtype"
            and field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM
        ):
            dataType = field.enum_type.values_by_number.get(value, None).name

    return dataType


def get_tensor_data(tensor):
    if isinstance(tensor, np.ndarray):
        return tensor.astype(float)

    dims = get_tensor_dims(tensor)
    elems = np.product(dims)

    if tensor.tensor_content:
        # TODO: support other types
        dataType = get_tensor_dtype(tensor)
        if dataType == "DT_FLOAT":
            data = struct.unpack("<" + str(elems) + "f", tensor.tensor_content)
        elif dataType == "DT_INT32":
            data = struct.unpack("<" + str(elems) + "i", tensor.tensor_content)
        elif dataType == "DT_BOOL":
            data = struct.unpack("<" + str(elems) + "?", tensor.tensor_content)
        else:
            print("UNSUPPORTED: data type", dataType)
    if tensor.float_val:
        data = tensor.float_val
    if tensor.int_val:
        data = np.array(tensor.int_val, dtype=float)
    if tensor.bool_val:
        data = np.array(tensor.bool_val, dtype=float)
    return np.array(data).reshape(dims)


def flatten(items, enter=lambda x: isinstance(x, list)):
    # http://stackoverflow.com/a/40857703
    # https://github.com/ctmakro/canton/blob/master/canton/misc.py
    """Yield items from any nested iterable; see REF."""
    for x in items:
        if enter(x):
            yield from flatten(x)
        else:
            yield x


def replace_strings_in_list(array_of_strigs, replace_with_strings):
    "A value in replace_with_strings can be either single string or list of strings"
    potentially_nested_list = [
        replace_with_strings.get(s) or s for s in array_of_strigs
    ]
    return list(flatten(potentially_nested_list))


def remove_duplicates_from_list(array):
    "Preserves the order of elements in the list"
    output = []
    unique = set()
    for a in array:
        if a not in unique:
            unique.add(a)
            output.append(a)
    return output


#########################################################


def pool_to_HW(shape, data_frmt):
    """ Convert from NHWC|NCHW => HW
    """
    if len(shape) != 4:
        return shape  # Not NHWC|NCHW, return as is
    if data_frmt == "NCHW":
        return [shape[2], shape[3]]
    return [shape[1], shape[2]]


def strides_to_HW(shape, format):
    return pool_to_HW(shape, format)


def axis_to_barracuda(axis, input_rank):
    N = 0
    H = 1
    W = 2
    C = 3
    if axis < 0:
        axis = input_rank - axis
    assert axis >= 0
    assert axis < input_rank
    if input_rank == 4:
        # [NHWC]
        return [N, H, W, C][axis]
    if input_rank == 3:
        # [N_WC]
        return [N, W, C][axis]
    elif input_rank == 2:
        # [N__C]
        return [N, C][axis]
    elif input_rank == 1:
        # [___C]
        return [C][axis]
    return -1


#########################################################


def sqr_diff(name, a, b):
    nn = barracuda.Build(name)
    d = nn.sub(a, b)
    nn.mul(d, d, out=name)
    return nn.layers


def strided_slice(
    name,
    input,
    input_rank,
    begin,
    end,
    strides,
    begin_mask,
    end_mask,
    ellipsis_mask,
    new_axis_mask,
    shrink_axis_mask,
):
    assert input_rank != -1
    begin = begin.astype(np.int32).tolist()
    end = end.astype(np.int32).tolist()
    strides = strides.astype(np.int32).tolist()

    # StridedSlice range and mask descriptions:
    #   https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/strided-slice
    # TODO: I don't think elipsis and newaxis would work together well with current implementation

    assert len(begin) == len(end)
    assert len(begin) == len(strides)

    # prepare begin, end, stride arrays
    output_rank = input_rank
    insert_pos = 0
    while ellipsis_mask:
        ellipsis_mask >>= 1
        insert_pos += 1

    # NOTE: begin=0, end=0, stride=1  <=  full range from existing axis
    #       begin=0, end=0, stride=0  <=  new axis OR shrink axis to single 1st element
    #       begin=N, end=N, stride=0  <=              shrink axis to single Nth element
    while len(begin) < input_rank:
        if insert_pos:
            begin.insert(insert_pos, 0)
            end.insert(insert_pos, 0)
            strides.insert(insert_pos, 1)
        else:
            begin.append(0)
            end.append(0)
            strides.append(1)
    assert len(begin) <= input_rank

    descriptor_count = input_rank
    for i in range(len(begin)):
        if begin_mask & (1 << i):
            begin[i] = 0
        if end_mask & (1 << i):
            end[i] = 0
        if new_axis_mask & (1 << i):
            begin[i] = end[i] = strides[i] = 0
            output_rank += 1
        if shrink_axis_mask & (1 << i):
            end[i] = begin[i]
            strides[i] = 0
            output_rank -= 1

    # convert to Barracuda layout
    descriptor_count = len(begin)
    assert descriptor_count <= 4
    if descriptor_count == 3:
        begin = [begin[0], 0, begin[1], begin[2]]
        end = [end[0], 0, end[1], end[2]]
        strides = [strides[0], 1, strides[1], strides[2]]
    elif descriptor_count == 2:
        begin = [begin[0], 0, 0, begin[1]]
        end = [end[0], 0, 0, end[1]]
        strides = [strides[0], 1, 1, strides[1]]
    elif descriptor_count == 1:
        begin = [0, 0, 0, begin[0]]
        end = [0, 0, 0, end[0]]
        strides = [1, 1, 1, strides[0]]

    nn = barracuda.Build(name)
    nn.strided_slice(input, begin, end, strides, output_rank, out=name)
    return nn.layers


# search backwards starting from index_of_actual_output_node for non-const node
def locate_actual_output_node(
    nodes, index_of_actual_output_node=-1, find_type="Reshape"
):
    while (-index_of_actual_output_node - 1) < len(nodes) and nodes[
        index_of_actual_output_node
    ].op != find_type:
        index_of_actual_output_node -= 1
    actual_output_node = nodes[index_of_actual_output_node]
    assert -index_of_actual_output_node < len(nodes)
    return actual_output_node


def gru(
    nodes,
    inputs,
    tensors,
    context,
    index_of_actual_output_node,
    assert_output_node_op_type=None,
):
    assert len(inputs) == 2

    def find_tensor_by_name(name, default=None):
        nonlocal tensors
        candidates = [t for t in tensors if t.name.endswith(name)]
        return candidates[0].data if candidates else default

    input = inputs[-1]
    state = inputs[0]
    gates_kernel = find_tensor_by_name("/gates/kernel")
    gates_bias = find_tensor_by_name(
        "/gates/bias", default=np.zeros(np.shape(gates_kernel)[-1])
    )
    candidate_kernel = find_tensor_by_name("/candidate/kernel")
    candidate_bias = find_tensor_by_name(
        "/candidate/bias", default=np.zeros(np.shape(candidate_kernel)[-1])
    )
    new_state = nodes[-1].name + "_h"

    assert np.shape(gates_kernel)[-1] == np.shape(gates_bias)[-1]
    assert np.shape(candidate_kernel)[-1] == np.shape(candidate_bias)[-1]

    num_gates = 2
    seq_length = 1
    hidden_size = np.shape(gates_kernel)[-1] // num_gates

    gate_kernels = np.split(gates_kernel, num_gates, axis=-1)
    gate_biases = np.split(gates_bias, num_gates, axis=-1)

    context.model_tensors["kernel_r"] = gate_kernels[0]
    context.model_tensors["kernel_u"] = gate_kernels[1]
    context.model_tensors["kernel_c"] = candidate_kernel
    context.model_tensors["bias_r"] = gate_biases[0]
    context.model_tensors["bias_u"] = gate_biases[1]
    context.model_tensors["bias_c"] = candidate_bias

    context.layer_ranks[state] = 2

    new_layers = barracuda.gru(
        "gru",
        input,
        state,
        "kernel_r",
        "kernel_u",
        "kernel_c",
        "bias_r",
        "bias_u",
        "bias_c",
        new_state,
    )

    state_shape = [1, 1, seq_length, hidden_size]
    context.model_memories += [state_shape, state, new_state]

    # map exptected output of the replaced pattern to output from our GRU cell
    actual_output_node = locate_actual_output_node(
        nodes, index_of_actual_output_node, assert_output_node_op_type
    )
    context.map_ignored_layer_to_its_input[actual_output_node.name] = new_state

    return new_layers


def basic_lstm(nodes, inputs, tensors, context, find_type="Reshape"):
    assert len(inputs) == 2

    def find_tensor_by_name(name, default=None):
        nonlocal tensors
        candidates = [t for t in tensors if t.name.endswith(name)]
        return candidates[0].data if candidates else default

    def find_forget_bias():
        nonlocal nodes
        nonlocal tensors
        # TODO: make it more fault-tolerant
        # search for scalar float constant that is input to Add node
        # and hope it is not a constant for some complex activation function
        for t in tensors:
            if np.prod(t.shape) == 1 and get_tensor_dtype(t.obj) == "DT_FLOAT":
                for n in nodes:
                    if n.op == "Add" and t.name in n.input:
                        return t.data
        return np.zeros(1)

    input = inputs[-1]
    state_c = inputs[0] + "_c"
    state_h = inputs[0] + "_h"
    kernel = find_tensor_by_name("/kernel")
    bias = find_tensor_by_name("/bias", default=np.zeros(np.shape(kernel)[-1]))
    forget_bias = find_forget_bias()
    new_state_c = nodes[-1].name + "_c"
    new_state_h = nodes[-1].name + "_h"

    assert np.shape(kernel)[-1] == np.shape(bias)[-1]

    num_gates = 4
    seq_length = 1
    hidden_size = np.shape(kernel)[-1] // num_gates

    kernels = np.split(kernel, num_gates, axis=-1)
    biases = np.split(bias, num_gates, axis=-1)

    context.model_tensors["kernel_i"] = kernels[0]
    context.model_tensors["kernel_j"] = kernels[1]
    context.model_tensors["kernel_f"] = kernels[2]
    context.model_tensors["kernel_o"] = kernels[3]
    context.model_tensors["bias_i"] = biases[0]
    context.model_tensors["bias_j"] = biases[1]
    context.model_tensors["bias_f"] = biases[2] + forget_bias
    context.model_tensors["bias_o"] = biases[3]

    context.layer_ranks[state_c] = 2
    context.layer_ranks[state_h] = 2

    # lstm_value/strided_slice/stack => lstm_value
    lstm_name = next(i.name for i in nodes if i.name.startswith("lstm")).split("/")[0]

    new_layers = barracuda.lstm(
        lstm_name,
        input,
        state_c,
        state_h,
        "kernel_i",
        "kernel_j",
        "kernel_f",
        "kernel_o",
        "bias_i",
        "bias_j",
        "bias_f",
        "bias_o",
        new_state_c,
        new_state_h,
    )

    state_shape = [1, 1, seq_length, hidden_size]
    context.model_memories += [state_shape, state_c, new_state_c]
    context.model_memories += [state_shape, state_h, new_state_h]

    # map expected output of the replaced pattern to output from our LSTM cell
    actual_output_node = locate_actual_output_node(nodes, -1, find_type)
    concat_out_node = locate_actual_output_node(nodes, -1, "ConcatV2")
    context.map_ignored_layer_to_its_input[actual_output_node.name] = new_state_h
    context.map_ignored_layer_to_its_input[concat_out_node.name] = new_state_c

    return new_layers


#########################################################


def process_layer(layer, context, args):
    model_tensors = context.model_tensors
    input_shapes = context.input_shapes
    layer_ranks = context.layer_ranks
    map_ignored_layer_to_its_input = context.map_ignored_layer_to_its_input

    name = layer.name
    class_name = layer.op
    inputs = (
        layer.input
    )  # Tensorflow inputs are always explicit, but in case of Keras we had 'inputs = layer.input or [prev_layer_name]'
    inputs = replace_strings_in_list(inputs, map_ignored_layer_to_its_input)

    if class_name == "Nop":
        assert len(inputs) <= 1
        map_ignored_layer_to_its_input[name] = inputs
        return

    if class_name == "Const":
        model_tensors[name] = layer.attr["value"].tensor
        layer_ranks[name] = (
            get_layer_rank(layer) or 1
        )  # we treast constants without shape as rank=1 (scalar converted to tensor)
        return

    if class_name == "Placeholder":
        assert inputs == []
        map_ignored_layer_to_its_input[name] = inputs
        input_shapes[name] = get_layer_shape(layer)
        layer_ranks[name] = get_layer_rank(layer)
        return

    if class_name == "Identity":
        connected_to_const = len(inputs) == 1 and inputs[0] in model_tensors
        if connected_to_const:
            map_ignored_layer_to_its_input[name] = inputs
            return
        else:
            # treat Identity layer that are connected to processing nodes
            # as output from the network
            class_name = "Linear"

    if args.print_layers or args.verbose:
        var_tensors = [i for i in inputs if i not in model_tensors]
        const_tensors = [i for i in inputs if i in model_tensors]
        print(
            "'%s' %s Vars:%s Const:%s" % (name, class_name, var_tensors, const_tensors)
        )

    if class_name in known_activations:
        activation = class_name
        class_name = "Activation"
    else:
        activation = "Linear"

    if class_name not in known_classes:
        if class_name in requires_runtime_flag:
            print("SKIP:", class_name, "layer is used only for training")
        else:
            print("IGNORED:", class_name, "unknown layer")
        map_ignored_layer_to_its_input[name] = inputs
        return

    klass = known_classes[class_name]
    if type(klass) == int:
        klass = Struct(id=klass)

    o_l = Struct()
    o_l.type = klass.id
    o_l.class_name = class_name
    o_l.name = name

    auto_pad = get_attr(layer, "padding")  # layer.attr['padding'].s.decode("utf-8")
    pads = get_attr(layer, "pads")
    strides = get_attr(layer, "strides")  # layer.attr['strides'].list.i
    pool_size = get_attr(layer, "ksize")  # layer.attr['ksize'].list.i
    shape = get_attr(layer, "shape")
    starts = get_attr(layer, "starts")
    ends = get_attr(layer, "ends")
    slice_strides = get_attr(layer, "slice_strides")
    rank = get_attr(layer, "rank") or get_layer_rank(layer)
    data_frmt = get_attr(
        layer, "data_format"
    )  # layer.attr['data_format'].s.decode("utf-8")
    axis = get_attr(layer, "axis")
    alpha = get_attr(layer, "alpha", default=1)
    beta = get_attr(layer, "beta")

    if activation and activation not in known_activations:
        print("IGNORED: unknown activation", activation)
    if auto_pad and auto_pad not in known_paddings:
        print("IGNORED: unknown padding", auto_pad)
    if data_frmt and data_frmt not in supported_data_formats:
        print("UNSUPPORTED: data format", data_frmt)

    o_l.activation = known_activations.get(activation) or 0
    o_l.pads = (
        known_paddings.get(auto_pad) if auto_pad else pads or starts or [0, 0, 0, 0]
    )
    o_l.strides = strides_to_HW(strides, data_frmt) if strides else slice_strides or []
    o_l.pool_size = (
        pool_to_HW(pool_size, data_frmt) if pool_size else ends or shape or []
    )
    o_l.axis = embody(axis, default=-1)
    o_l.alpha = embody(alpha, default=1)
    o_l.beta = beta or 0
    o_l.rank = (
        -1
    )  # default initialization, actual value will be set later on in this function

    tensor_names = [i for i in inputs if i in model_tensors]
    o_l.tensors = [
        Struct(
            name=x,
            shape=get_tensor_dims(model_tensors[x]),
            data=get_tensor_data(model_tensors[x]),
        )
        for x in tensor_names
    ]
    # Patch shapes & data
    layer_has_model_tensors = len(o_l.tensors) > 0
    if hasattr(klass, "out_shapes") and layer_has_model_tensors:
        shapes = klass.out_shapes([x.shape for x in o_l.tensors])

        # if we have more shapes than actual tensors,
        # then create & fill missing tensors with zeros
        in_tensor_num = len(o_l.tensors)
        for index, new_shape in enumerate(shapes):
            if index >= in_tensor_num:
                new_tensor = Struct(
                    name=("%s/patch:%i") % (name, index - in_tensor_num),
                    shape=new_shape,
                    data=np.zeros(new_shape),
                )
                o_l.tensors.append(new_tensor)
        assert len(shapes) <= len(o_l.tensors)

        if hasattr(klass, "patch_data"):
            data = [x.data for x in o_l.tensors]

            patch_data_fn = klass.patch_data
            patch_data_expected_arg_count = patch_data_fn.__code__.co_argcount
            patch_data_args = (
                (data, layer) if patch_data_expected_arg_count > 1 else (data,)
            )
            tensor_data = patch_data_fn(*patch_data_args)
            o_l.tensors = o_l.tensors[
                : len(tensor_data)
            ]  # resize tensor array to match patched data - patching might reduce number of tensors
            for x, data in zip(o_l.tensors, tensor_data):
                x.data = data

        # after this point we should have equal amount of shapes and tensors
        assert len(o_l.tensors) == len(shapes)

        for x, shape in zip(o_l.tensors, shapes):
            assert x.data.size == np.prod(shape)
            x.shape = shape

        o_l.inputs = [i for i in inputs if i not in model_tensors]

    else:
        # no 'patch_data' lambda was specifiowned, op does not require tensor args
        o_l.tensors = []
        o_l.inputs = inputs

    # Force all tensors to float32
    for x in o_l.tensors:
        x.data = x.data.astype(np.float32)

    input_ranks = [layer_ranks.get(i, -1) for i in o_l.inputs]
    for i in o_l.inputs:
        if i not in layer_ranks and "lstm" not in i:
            print("WARNING: rank unknown for tensor", i, "while processing node", name)
    if hasattr(klass, "rank"):
        rank = klass.rank
        if hasattr(rank, "__call__"):
            assert (
                -1 not in input_ranks
            )  # for rank() lambda all input ranks have to be known (not -1)
            rank = rank(input_ranks)
    if rank is None:

        def all_elements_equal(arr):  # http://stackoverflow.com/q/3844948/
            return arr.count(arr[0]) == len(arr)

        assert len(input_ranks) > 0
        assert all_elements_equal(input_ranks)
        rank = input_ranks[0]
    layer_ranks[name] = rank
    o_l.rank = rank

    # Layer is ready
    context.layers.append(o_l)


class ModelBuilderContext:
    def __init__(self):
        self.layers = []
        self.input_shapes = {}
        self.model_tensors = {}
        self.model_memories = []
        self.layer_ranks = {}
        self.map_ignored_layer_to_its_input = {}


def process_model(model, args):
    o_context = ModelBuilderContext()

    # Find node patterns
    nodes_as_array = [node for node in model.node]
    nodes_as_array = slow_but_stable_topological_sort(nodes_as_array, verbose=True)

    node_index = 0
    while node_index < len(nodes_as_array):
        node = nodes_as_array[node_index]
        match = False
        for pattern_repr, pattern_name in known_patterns.items():
            pattern = eval(pattern_repr)
            if node_index + len(pattern) > len(nodes_as_array):
                continue  # pattern too long, skip

            require_exact_match = pattern[0] == "Const" or pattern[0] == "Identity"
            pattern_end = node_index

            def match_node(node, pattern):
                return node.op == pattern or (
                    hasattr(pattern, "match") and pattern.match(node.name)
                )

            for p in pattern:
                if not require_exact_match:
                    while (
                        pattern_end < len(nodes_as_array)
                        and nodes_as_array[pattern_end].op != p
                        and (
                            nodes_as_array[pattern_end].op == "Const"
                            or nodes_as_array[pattern_end].op == "Identity"
                        )
                    ):
                        pattern_end += 1
                if pattern_end >= len(nodes_as_array):
                    break

                match = False
                if hasattr(p, "match"):  # regexp
                    while pattern_end < len(nodes_as_array) and p.match(
                        nodes_as_array[pattern_end].name
                    ):
                        match = True
                        pattern_end += 1
                else:  # exact string
                    match = nodes_as_array[pattern_end].op == p
                    pattern_end += 1

                if not match:
                    break

            def get_tensors(pattern_nodes):
                nonlocal o_context
                map_ignored_layer_to_its_input = (
                    o_context.map_ignored_layer_to_its_input
                )
                model_tensors = o_context.model_tensors

                # tensors <= all Const nodes within this pattern
                const_nodes = [n for n in pattern_nodes if n.op == "Const"]

                # TODO: unify / reuse code from process_layer
                identity_nodes = [n for n in pattern_nodes if n.op == "Identity"]
                for i in identity_nodes:
                    inputs = replace_strings_in_list(
                        i.input, map_ignored_layer_to_its_input
                    )
                    map_ignored_layer_to_its_input[i.name] = inputs

                # gather inputs from Op nodes (not Const, not Identity)
                op_nodes = [
                    n
                    for n in pattern_nodes
                    if n not in const_nodes and n not in identity_nodes
                ]
                inputs_to_op_nodes = list(
                    flatten([list(flatten(n.input)) for n in op_nodes])
                )
                inputs_to_op_nodes = replace_strings_in_list(
                    inputs_to_op_nodes, map_ignored_layer_to_its_input
                )
                inputs_to_op_nodes = [i.split(":")[0] for i in inputs_to_op_nodes]

                const_nodes_by_name = {n.name: n for n in const_nodes}
                tensors = []
                for i in inputs_to_op_nodes:
                    if i in model_tensors:
                        src = model_tensors[i]
                        tensors += [
                            Struct(
                                name=i,
                                obj=src,
                                shape=get_tensor_dims(src),
                                data=get_tensor_data(src),
                            )
                        ]
                    elif i in const_nodes_by_name:
                        src = const_nodes_by_name[i].attr["value"].tensor
                        tensors += [
                            Struct(
                                name=i,
                                obj=src,
                                shape=get_tensor_dims(src),
                                data=get_tensor_data(src),
                            )
                        ]
                tensor_names = [n.name for n in tensors]

                # filter only inputs that are coming from nodes that are outside this pattern
                # preserve the order
                pattern_nodes = [n.name for n in pattern_nodes] + tensor_names
                # inputs_from_outside_pattern = remove_duplicates_from_list([i for i in inputs_to_op_nodes if
                #   nodes_by_name[i] not in pattern_nodes])
                inputs_from_outside_pattern = remove_duplicates_from_list(
                    [i for i in inputs_to_op_nodes if i not in pattern_nodes]
                )

                return inputs_from_outside_pattern, tensors

            if match:
                nodes = nodes_as_array[node_index:pattern_end]
                name = nodes[-1].name
                var_tensors, const_tensors = get_tensors(nodes)
                if args.print_patterns or args.verbose:
                    print(
                        "PATTERN:",
                        name,
                        "~~",
                        pattern_name,
                        "<-",
                        var_tensors,
                        "+",
                        [t.name for t in const_tensors],
                    )
                    print("        ", pattern)
                for n in nodes:
                    if n.op == "Const" or n.op == "Identity":
                        process_layer(n, o_context, args)

                new_layers = transform_patterns[pattern_name](
                    nodes, var_tensors, const_tensors, o_context
                )
                if not isinstance(new_layers, list):
                    if not hasattr(new_layers, name):
                        new_layers.name = name
                    new_layers = [new_layers]

                for l in new_layers:
                    # TODO: prefix new layer names with scope, patch inputs
                    # l.name = name + '/' + l.name
                    process_layer(l, o_context, args)

                node_index = pattern_end
                break  # pattern found & processed

        if not match:
            # TODO: gather tensors in the same way as patterns do
            process_layer(node, o_context, args)
            node_index += 1

    def find_unconnected_const_nodes(nodes):
        nodes_with_consts = {node.name: node for node in nodes if node.op == "Const"}
        for node in nodes:
            for i in node.input:
                nodes_with_consts.pop(i, None)
        return list(nodes_with_consts.keys())

    return (
        o_context.layers,
        o_context.input_shapes,
        o_context.model_tensors,
        o_context.model_memories,
        find_unconnected_const_nodes(nodes_as_array),
    )


# Sort nodes so that all input dependencies are satisfied beforehand
# while preserving original order of the nodes in the model whenever possible.
# NOITE: preservation of original order is important for pattern matching
def slow_but_stable_topological_sort(nodes, verbose):

    nodes_with_consts = [node for node in nodes if node.op == "Const"]
    nodes_for_sorting = [node for node in nodes if node.op != "Const"]

    # TODO: optimize for performance
    # based on http://blog.gapotchenko.com/stable-topological-sort

    def assign_ids(nodes):
        ids = []
        id_by_name = {}
        id = 0
        for node in nodes:
            id_by_name[node.name] = id
            ids.append(id)
            id += 1

        inputs_by_id = [None] * len(nodes)
        for node in nodes:
            id = id_by_name[node.name]
            inputs_by_id[id] = {id_by_name.get(i, -1) for i in node.input}

        return ids, inputs_by_id

    def sort(ids, inputs_by_id, verbose_lambda):
        sorted = False
        n = len(ids)
        while not sorted:
            sorted = True
            for i in range(n):
                for j in range(i):
                    if ids[i] in inputs_by_id[ids[j]]:
                        tmp = ids.pop(i)
                        ids.insert(j, tmp)
                        sorted = False
            verbose_lambda(sorted)
        return ids

    prefix_printed = False

    def print_status(sorted):
        nonlocal prefix_printed
        if not sorted:
            if not prefix_printed:
                print("Sorting model, may take a while...", end="", flush=True)
                prefix_printed = True
            else:
                print(".", end="", flush=True)
        else:
            if prefix_printed:
                print(" Done!")

    ids, inputs_by_id = assign_ids(nodes_for_sorting)
    ids = sort(
        ids, inputs_by_id, lambda sorted: print_status(sorted) if verbose else None
    )

    assert len(ids) == len(nodes_for_sorting)
    assert len(ids) + len(nodes_with_consts) == len(nodes)
    return nodes_with_consts + [nodes_for_sorting[id] for id in ids]


def very_slow_but_stable_topological_sort(nodes, verbose):
    # TODO: optimize for performance
    # based on http://blog.gapotchenko.com/stable-topological-sort
    n = len(nodes)
    sorted = False

    while not sorted:
        sorted = True
        for i in range(n):
            for j in range(i):
                if nodes[i].name in nodes[j].input:
                    tmp = nodes.pop(i)
                    nodes.insert(j, tmp)
                    sorted = False
    assert len(nodes) == n
    return nodes


#########################################################


def convert(
    source_file,
    target_file,
    trim_unused_by_output="",
    verbose=False,
    compress_f16=False,
):
    """
    Converts a TensorFlow model into a Barracuda model.
    :param source_file: The TensorFlow Model
    :param target_file: The name of the file the converted model will be saved to
    :param trim_unused_by_output: The regexp to match output nodes to remain in the model.
        All other unconnected nodes will be removed.
    :param verbose: If True, will display debug messages
    :param compress_f16: If true, the float values will be converted to f16
    :return:
    """
    if type(verbose) == bool:
        args = Struct()
        args.verbose = verbose
        args.print_layers = verbose
        args.print_source_json = verbose
        args.print_barracuda_json = verbose
        args.print_layer_links = verbose
        args.print_patterns = verbose
        args.print_tensors = verbose
        args.print_supported_ops = verbose
    else:
        args = verbose

    if args.print_supported_ops:
        barracuda.print_known_operations(known_classes, known_activations)

    # Load Tensorflow model
    print("Converting %s to %s" % (source_file, target_file))
    f = open(source_file, "rb")
    i_model = tf.GraphDef()
    i_model.ParseFromString(f.read())

    if args.verbose:
        print("OP_TYPES:", {layer.op for layer in i_model.node})

    if args.print_source_json or args.verbose:
        for layer in i_model.node:
            if not layer.op == "Const":
                print("MODEL:", MessageToJson(layer) + ",")

    # Convert
    o_model = barracuda.Model()
    o_model.layers, o_input_shapes, o_model.tensors, o_model.memories, o_model.globals = process_model(
        i_model, args
    )

    # Cleanup unconnected Identities (they might linger after processing complex node patterns like LSTM)
    def cleanup_layers(layers):
        all_layers = {l.name for l in layers}
        all_inputs = {i for l in layers for i in l.inputs}

        def is_unconnected_identity(layer):
            if layer.class_name == "Activation" and layer.activation == 0:  # Identity
                assert len(layer.inputs) == 1
                if layer.inputs[0] not in all_layers and layer.name not in all_inputs:
                    return True
            return False

        return [l for l in layers if not is_unconnected_identity(l)]

    o_model.layers = cleanup_layers(o_model.layers)

    all_inputs = {i for l in o_model.layers for i in l.inputs}

    # Trim
    if trim_unused_by_output:
        o_model.layers = barracuda.trim(
            o_model.layers, trim_unused_by_output, args.verbose
        )

    # Create load layer for constants
    def dims_to_barracuda_shape(dims):
        shape = list(dims)
        while len(shape) < 4:
            shape = [1] + shape
        return shape

    const_tensors = [i for i in all_inputs if i in o_model.tensors]
    const_tensors += o_model.globals
    for x in const_tensors:
        shape = dims_to_barracuda_shape(get_tensor_dims(o_model.tensors[x]))
        o_l = Struct(
            type=255,  # Load
            class_name="Const",
            name=x,
            pads=[0, 0, 0, 0],
            strides=[],
            pool_size=[],
            axis=-1,
            alpha=1,
            beta=0,
            activation=0,
            inputs=[],
            tensors=[
                Struct(
                    name=x,
                    shape=shape,
                    data=np.reshape(get_tensor_data(o_model.tensors[x]), shape).astype(
                        np.float32
                    ),
                )
            ],
        )
        o_model.layers.insert(0, o_l)

    # Find model inputs & outputs
    all_layers = {l.name for l in o_model.layers}
    # global inputs => are inputs that are NOT connected to any layer in the network
    # global outputs => are outputs that are NOT feeding any layer in the network OR are coming from Identity layers
    o_model.inputs = {
        i: o_input_shapes[i]
        for l in o_model.layers
        for i in l.inputs
        if i not in all_layers and i not in o_model.memories
    }

    def is_output_layer(layer):
        if (
            layer.class_name == "Const"
        ):  # Constants never count as global output even when unconnected
            return False
        if (
            layer.name not in all_inputs
        ):  # this layer is not inputing to any other layer
            return True
        if (
            layer.class_name == "Activation" and layer.activation == 0
        ):  # Identity marks global output
            return True
        return False

    o_model.outputs = [l.name for l in o_model.layers if is_output_layer(l)]

    # Compress
    if compress_f16:
        o_model = barracuda.compress(o_model)

    # Sort model so that layer inputs are always ready upfront
    o_model.layers = barracuda.sort(
        o_model.layers, o_model.inputs, o_model.memories, args.verbose
    )
    o_model.layers = barracuda.fuse(o_model.layers, args.verbose)

    # Summary
    barracuda.summary(
        o_model,
        print_layer_links=args.print_layer_links or args.verbose,
        print_barracuda_json=args.print_barracuda_json or args.verbose,
        print_tensors=args.print_tensors or args.verbose,
    )

    # Write to file
    barracuda.write(o_model, target_file)
    print("DONE: wrote", target_file, "file.")