module name=conv1d_manual_backward, target=x86-64-linux-avx-f16c-profile-sse41 {
// Lowered, profiler-instrumented body of the conv1d_manual_backward pipeline.
// Arguments are four buffers: input, filter and d_output are treated as inputs,
// d_input is the output (see the "Input buffer"/"Output buffer" assert messages).
// For every d_input element (x, ci, n) it accumulates, in a one-element float32
// scratch f_input_d, the sum over (r4$y, r4$x) of
//   d_output[x - r4$x, r4$y, n] * filter[r4$x, ci, r4$y]
// and stores that sum. The host data of `input` is never read here; only its
// extents (input.extent.0 in particular) feed the loop bounds.
func conv1d_manual_backward(input, filter, d_output, d_input)
{
// --- Profiler setup: three slots, 0 = overhead, 1 = d_input, 2 = f_input_d ---
register_destructor("halide_profiler_pipeline_end", halide_profiler_get_state())
allocate profiling_func_names[(void *) * 3]
profiling_func_names[0] = "overhead"
profiling_func_names[2] = "f_input_d"
profiling_func_names[1] = "d_input"
// Per-slot stack peak; slot 2 is 4, which matches the float32 * 1 allocation of
// f_input_d below (presumably a byte count — confirm against the profiler runtime).
allocate profiling_func_stack_peak_buf[uint64 * 3]
profiling_func_stack_peak_buf[0] = (uint64)0
profiling_func_stack_peak_buf[1] = (uint64)0
profiling_func_stack_peak_buf[2] = (uint64)4

let profiler_token = halide_profiler_pipeline_start("conv1d_manual_backward", 3, profiling_func_names)

// A negative token is treated as a failure and becomes the assert's error value.
assert((0 <= profiler_token), profiler_token)

let profiler_state = halide_profiler_get_state()

let profiler_pipeline_state = halide_profiler_get_pipeline_state("conv1d_manual_backward")

halide_profiler_incr_active_threads(profiler_state)
halide_profiler_stack_peak_update(profiler_pipeline_state, profiling_func_stack_peak_buf)
// --- Reject null buffer arguments before touching any field ---
assert((reinterpret(input.buffer) != (uint64)0), halide_error_buffer_argument_is_null("input"))
assert((reinterpret(filter.buffer) != (uint64)0), halide_error_buffer_argument_is_null("filter"))
assert((reinterpret(d_output.buffer) != (uint64)0), halide_error_buffer_argument_is_null("d_output"))
assert((reinterpret(d_input.buffer) != (uint64)0), halide_error_buffer_argument_is_null("d_input"))

// --- Unpack d_input: host pointer, type, and min/extent/stride of its 3 dimensions ---
let d_input = _halide_buffer_get_host(d_input.buffer)

let d_input.type.code = _halide_buffer_get_type_code(d_input.buffer)

let d_input.type.bits = _halide_buffer_get_type_bits(d_input.buffer)

let d_input.type.lanes = _halide_buffer_get_type_lanes(d_input.buffer)

let d_input.min.0 = _halide_buffer_get_min(d_input.buffer, 0)

let d_input.extent.0 = _halide_buffer_get_extent(d_input.buffer, 0)

let d_input.stride.0 = _halide_buffer_get_stride(d_input.buffer, 0)

let d_input.min.1 = _halide_buffer_get_min(d_input.buffer, 1)

let d_input.extent.1 = _halide_buffer_get_extent(d_input.buffer, 1)

let d_input.stride.1 = _halide_buffer_get_stride(d_input.buffer, 1)

let d_input.min.2 = _halide_buffer_get_min(d_input.buffer, 2)

let d_input.extent.2 = _halide_buffer_get_extent(d_input.buffer, 2)

let d_input.stride.2 = _halide_buffer_get_stride(d_input.buffer, 2)

// --- Unpack d_output ---
let d_output = _halide_buffer_get_host(d_output.buffer)

let d_output.type.code = _halide_buffer_get_type_code(d_output.buffer)

let d_output.type.bits = _halide_buffer_get_type_bits(d_output.buffer)

let d_output.type.lanes = _halide_buffer_get_type_lanes(d_output.buffer)

let d_output.min.0 = _halide_buffer_get_min(d_output.buffer, 0)

let d_output.extent.0 = _halide_buffer_get_extent(d_output.buffer, 0)

let d_output.stride.0 = _halide_buffer_get_stride(d_output.buffer, 0)

let d_output.min.1 = _halide_buffer_get_min(d_output.buffer, 1)

let d_output.extent.1 = _halide_buffer_get_extent(d_output.buffer, 1)

let d_output.stride.1 = _halide_buffer_get_stride(d_output.buffer, 1)

let d_output.min.2 = _halide_buffer_get_min(d_output.buffer, 2)

let d_output.extent.2 = _halide_buffer_get_extent(d_output.buffer, 2)

let d_output.stride.2 = _halide_buffer_get_stride(d_output.buffer, 2)

// --- Unpack filter ---
let filter = _halide_buffer_get_host(filter.buffer)

let filter.type.code = _halide_buffer_get_type_code(filter.buffer)

let filter.type.bits = _halide_buffer_get_type_bits(filter.buffer)

let filter.type.lanes = _halide_buffer_get_type_lanes(filter.buffer)

let filter.min.0 = _halide_buffer_get_min(filter.buffer, 0)

let filter.extent.0 = _halide_buffer_get_extent(filter.buffer, 0)

let filter.stride.0 = _halide_buffer_get_stride(filter.buffer, 0)

let filter.min.1 = _halide_buffer_get_min(filter.buffer, 1)

let filter.extent.1 = _halide_buffer_get_extent(filter.buffer, 1)

let filter.stride.1 = _halide_buffer_get_stride(filter.buffer, 1)

let filter.min.2 = _halide_buffer_get_min(filter.buffer, 2)

let filter.extent.2 = _halide_buffer_get_extent(filter.buffer, 2)

let filter.stride.2 = _halide_buffer_get_stride(filter.buffer, 2)

// --- Unpack input: note that no host pointer is fetched, only type and shape ---
let input.type.code = _halide_buffer_get_type_code(input.buffer)

let input.type.bits = _halide_buffer_get_type_bits(input.buffer)

let input.type.lanes = _halide_buffer_get_type_lanes(input.buffer)

let input.min.0 = _halide_buffer_get_min(input.buffer, 0)

let input.extent.0 = _halide_buffer_get_extent(input.buffer, 0)

let input.stride.0 = _halide_buffer_get_stride(input.buffer, 0)

let input.min.1 = _halide_buffer_get_min(input.buffer, 1)

let input.extent.1 = _halide_buffer_get_extent(input.buffer, 1)

let input.stride.1 = _halide_buffer_get_stride(input.buffer, 1)

let input.min.2 = _halide_buffer_get_min(input.buffer, 2)

let input.extent.2 = _halide_buffer_get_extent(input.buffer, 2)

let input.stride.2 = _halide_buffer_get_stride(input.buffer, 2)

// --- Required regions, derived from the d_input shape and the filter/input extents ---
// Largest dim-1 offset reached when the fused (extent.2 * extent.1) parallel index
// used below is divided by max(extent.2, 1).
let d_input.extent.1.required.s = (((d_input.extent.2 * d_input.extent.1) + -1) / max(d_input.extent.2, 1))

let d_input.extent.2.required = int32(abs(max(d_input.extent.2, 1)))

let d_input.stride.2.required = (d_input.extent.0 * (d_input.extent.1.required.s + 1))

let d_output.extent.0.required = ((min((filter.extent.0 + -1), ((d_input.min.0 + d_input.extent.0) + -2)) + ((d_input.min.0 + d_input.extent.0) - max(((d_input.min.0 - input.extent.0) + 1), 0))) - d_input.min.0)

let d_output.min.0.required = (d_input.min.0 - min((filter.extent.0 + -1), ((d_input.min.0 + d_input.extent.0) + -2)))

let filter.extent.0.required = (min(filter.extent.0, ((d_input.min.0 + d_input.extent.0) + -1)) - max(((d_input.min.0 - input.extent.0) + 1), 0))

// --- Bounds-query mode: rewrite a querying buffer's shape with the region computed above ---
// The literals 2, 32 match the type code / bits checked by the asserts below; 3 is
// presumably the dimension count. Each group of four make_struct values looks like
// (min, extent, stride, flags) for one dimension — confirm against halide_dimension_t.
if (_halide_buffer_is_bounds_query(d_input.buffer))
{
_halide_buffer_init(d_input.buffer, _halide_buffer_get_shape(d_input.buffer), reinterpret((uint64)0), (uint64)0, reinterpret((uint64)0), 2, 32, 3, make_struct(d_input.min.0, d_input.extent.0, 1, 0, d_input.min.1, (d_input.extent.1.required.s + 1), d_input.extent.0, 0, d_input.min.2, d_input.extent.2.required, d_input.stride.2.required, 0), (uint64)0)
}
if (_halide_buffer_is_bounds_query(d_output.buffer))
{
_halide_buffer_init(d_output.buffer, _halide_buffer_get_shape(d_output.buffer), reinterpret((uint64)0), (uint64)0, reinterpret((uint64)0), 2, 32, 3, make_struct(d_output.min.0.required, d_output.extent.0.required, 1, 0, 0, d_output.extent.1, d_output.extent.0.required, 0, d_input.min.2, d_input.extent.2, (d_output.extent.0.required * d_output.extent.1), 0), (uint64)0)
}
if (_halide_buffer_is_bounds_query(filter.buffer))
{
_halide_buffer_init(filter.buffer, _halide_buffer_get_shape(filter.buffer), reinterpret((uint64)0), (uint64)0, reinterpret((uint64)0), 2, 32, 3, make_struct(max(((d_input.min.0 - input.extent.0) + 1), 0), filter.extent.0.required, 1, 0, d_input.min.1, d_input.extent.1, filter.extent.0.required, 0, 0, d_output.extent.1, (filter.extent.0.required * d_input.extent.1), 0), (uint64)0)
}
if (_halide_buffer_is_bounds_query(input.buffer))
{
_halide_buffer_init(input.buffer, _halide_buffer_get_shape(input.buffer), reinterpret((uint64)0), (uint64)0, reinterpret((uint64)0), 2, 32, 3, make_struct(input.min.0, input.extent.0, 1, 0, input.min.1, input.extent.1, input.extent.0, 0, input.min.2, input.extent.2, (input.extent.0 * input.extent.1), 0), (uint64)0)
}
// --- Everything below (validation and compute) runs only if no buffer is a bounds query ---
if (!(((_halide_buffer_is_bounds_query(d_input.buffer) || _halide_buffer_is_bounds_query(d_output.buffer)) || _halide_buffer_is_bounds_query(filter.buffer)) || _halide_buffer_is_bounds_query(input.buffer)))
{
// Element type of every buffer must be code 2, 32 bits, 1 lane
// (code 2 is halide_type_float in HalideRuntime.h).
assert((((d_input.type.code == (uint8)2) && (d_input.type.bits == (uint8)32)) && (d_input.type.lanes == (uint16)1)), halide_error_bad_type("Output buffer d_input", d_input.type.code, (uint8)2, d_input.type.bits, (uint8)32, d_input.type.lanes, (uint16)1))
assert((((d_output.type.code == (uint8)2) && (d_output.type.bits == (uint8)32)) && (d_output.type.lanes == (uint16)1)), halide_error_bad_type("Input buffer d_output", d_output.type.code, (uint8)2, d_output.type.bits, (uint8)32, d_output.type.lanes, (uint16)1))
assert((((filter.type.code == (uint8)2) && (filter.type.bits == (uint8)32)) && (filter.type.lanes == (uint16)1)), halide_error_bad_type("Input buffer filter", filter.type.code, (uint8)2, filter.type.bits, (uint8)32, filter.type.lanes, (uint16)1))
assert((((input.type.code == (uint8)2) && (input.type.bits == (uint8)32)) && (input.type.lanes == (uint16)1)), halide_error_bad_type("Input buffer input", input.type.code, (uint8)2, input.type.bits, (uint8)32, input.type.lanes, (uint16)1))
// Per-dimension checks: extents must be non-negative, and each supplied buffer
// must cover the region this pipeline reads or writes.
assert((0 <= d_input.extent.0), halide_error_buffer_extents_negative("Output buffer d_input", 0, d_input.extent.0))
assert(((((d_input.min.1 + d_input.extent.1.required.s) - d_input.extent.1) + 1) <= d_input.min.1), halide_error_access_out_of_bounds("Output buffer d_input", 1, d_input.min.1, (d_input.min.1 + d_input.extent.1.required.s), d_input.min.1, ((d_input.min.1 + d_input.extent.1) + -1)))
assert((0 <= d_input.extent.1), halide_error_buffer_extents_negative("Output buffer d_input", 1, d_input.extent.1))
assert((((d_input.min.2 + d_input.extent.2.required) - d_input.extent.2) <= d_input.min.2), halide_error_access_out_of_bounds("Output buffer d_input", 2, d_input.min.2, ((d_input.min.2 + d_input.extent.2.required) + -1), d_input.min.2, ((d_input.min.2 + d_input.extent.2) + -1)))
assert((0 <= d_input.extent.2), halide_error_buffer_extents_negative("Output buffer d_input", 2, d_input.extent.2))
assert(((d_output.min.0 <= d_output.min.0.required) && (((d_output.min.0.required + d_output.extent.0.required) - d_output.extent.0) <= d_output.min.0)), halide_error_access_out_of_bounds("Input buffer d_output", 0, d_output.min.0.required, ((d_output.min.0.required + d_output.extent.0.required) + -1), d_output.min.0, ((d_output.min.0 + d_output.extent.0) + -1)))
assert((0 <= d_output.extent.0), halide_error_buffer_extents_negative("Input buffer d_output", 0, d_output.extent.0))
// The two inequalities together force d_output.min.1 to be exactly 0.
assert(((d_output.min.1 <= 0) && (0 <= d_output.min.1)), halide_error_access_out_of_bounds("Input buffer d_output", 1, 0, (d_output.extent.1 + -1), d_output.min.1, ((d_output.min.1 + d_output.extent.1) + -1)))
assert((0 <= d_output.extent.1), halide_error_buffer_extents_negative("Input buffer d_output", 1, d_output.extent.1))
assert(((d_output.min.2 <= d_input.min.2) && (((d_input.min.2 + d_input.extent.2) - d_output.extent.2) <= d_output.min.2)), halide_error_access_out_of_bounds("Input buffer d_output", 2, d_input.min.2, ((d_input.min.2 + d_input.extent.2) + -1), d_output.min.2, ((d_output.min.2 + d_output.extent.2) + -1)))
assert((0 <= d_output.extent.2), halide_error_buffer_extents_negative("Input buffer d_output", 2, d_output.extent.2))
assert(((filter.min.0 <= max(((d_input.min.0 - input.extent.0) + 1), 0)) && (((max(((d_input.min.0 - input.extent.0) + 1), 0) + filter.extent.0.required) - filter.extent.0) <= filter.min.0)), halide_error_access_out_of_bounds("Input buffer filter", 0, max(((d_input.min.0 - input.extent.0) + 1), 0), ((max(((d_input.min.0 - input.extent.0) + 1), 0) + filter.extent.0.required) + -1), filter.min.0, ((filter.min.0 + filter.extent.0) + -1)))
assert((0 <= filter.extent.0), halide_error_buffer_extents_negative("Input buffer filter", 0, filter.extent.0))
assert(((filter.min.1 <= d_input.min.1) && (((d_input.min.1 + d_input.extent.1) - filter.extent.1) <= filter.min.1)), halide_error_access_out_of_bounds("Input buffer filter", 1, d_input.min.1, ((d_input.min.1 + d_input.extent.1) + -1), filter.min.1, ((filter.min.1 + filter.extent.1) + -1)))
assert((0 <= filter.extent.1), halide_error_buffer_extents_negative("Input buffer filter", 1, filter.extent.1))
assert(((filter.min.2 <= 0) && ((d_output.extent.1 - filter.extent.2) <= filter.min.2)), halide_error_access_out_of_bounds("Input buffer filter", 2, 0, (d_output.extent.1 + -1), filter.min.2, ((filter.min.2 + filter.extent.2) + -1)))
assert((0 <= filter.extent.2), halide_error_buffer_extents_negative("Input buffer filter", 2, filter.extent.2))
assert((0 <= input.extent.0), halide_error_buffer_extents_negative("Input buffer input", 0, input.extent.0))
assert((0 <= input.extent.1), halide_error_buffer_extents_negative("Input buffer input", 1, input.extent.1))
assert((0 <= input.extent.2), halide_error_buffer_extents_negative("Input buffer input", 2, input.extent.2))
// The innermost dimension of every buffer must be dense (stride 1).
assert((d_input.stride.0 == 1), halide_error_constraint_violated("d_input.stride.0", d_input.stride.0, "1", 1))
assert((d_output.stride.0 == 1), halide_error_constraint_violated("d_output.stride.0", d_output.stride.0, "1", 1))
assert((filter.stride.0 == 1), halide_error_constraint_violated("filter.stride.0", filter.stride.0, "1", 1))
assert((input.stride.0 == 1), halide_error_constraint_violated("input.stride.0", input.stride.0, "1", 1))

// 64-bit products of the two innermost extents, used by the size checks below.
let d_input.total_extent.1 = (int64(d_input.extent.1) * int64(d_input.extent.0))

let d_output.total_extent.1 = (int64(d_output.extent.1) * int64(d_output.extent.0))

let filter.total_extent.1 = (int64(filter.extent.1) * int64(filter.extent.0))

let input.total_extent.1 = (int64(input.extent.1) * int64(input.extent.0))

// Size guards: every extent*stride span and every cumulative extent product,
// computed in 64 bits, must not exceed 2147483647.
assert((abs(int64(d_input.extent.0)) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("d_input", abs(int64(d_input.extent.0)), (uint64)2147483647))
assert((abs((int64(d_input.extent.1) * int64(d_input.stride.1))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("d_input", abs((int64(d_input.extent.1) * int64(d_input.stride.1))), (uint64)2147483647))
assert((d_input.total_extent.1 <= (int64)2147483647), halide_error_buffer_extents_too_large("d_input", d_input.total_extent.1, (int64)2147483647))
assert((abs((int64(d_input.extent.2) * int64(d_input.stride.2))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("d_input", abs((int64(d_input.extent.2) * int64(d_input.stride.2))), (uint64)2147483647))
assert(((int64(d_input.extent.2) * d_input.total_extent.1) <= (int64)2147483647), halide_error_buffer_extents_too_large("d_input", (int64(d_input.extent.2) * d_input.total_extent.1), (int64)2147483647))
assert((abs(int64(d_output.extent.0)) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("d_output", abs(int64(d_output.extent.0)), (uint64)2147483647))
assert((abs((int64(d_output.extent.1) * int64(d_output.stride.1))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("d_output", abs((int64(d_output.extent.1) * int64(d_output.stride.1))), (uint64)2147483647))
assert((d_output.total_extent.1 <= (int64)2147483647), halide_error_buffer_extents_too_large("d_output", d_output.total_extent.1, (int64)2147483647))
assert((abs((int64(d_output.extent.2) * int64(d_output.stride.2))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("d_output", abs((int64(d_output.extent.2) * int64(d_output.stride.2))), (uint64)2147483647))
assert(((int64(d_output.extent.2) * d_output.total_extent.1) <= (int64)2147483647), halide_error_buffer_extents_too_large("d_output", (int64(d_output.extent.2) * d_output.total_extent.1), (int64)2147483647))
assert((abs(int64(filter.extent.0)) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("filter", abs(int64(filter.extent.0)), (uint64)2147483647))
assert((abs((int64(filter.extent.1) * int64(filter.stride.1))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("filter", abs((int64(filter.extent.1) * int64(filter.stride.1))), (uint64)2147483647))
assert((filter.total_extent.1 <= (int64)2147483647), halide_error_buffer_extents_too_large("filter", filter.total_extent.1, (int64)2147483647))
assert((abs((int64(filter.extent.2) * int64(filter.stride.2))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("filter", abs((int64(filter.extent.2) * int64(filter.stride.2))), (uint64)2147483647))
assert(((int64(filter.extent.2) * filter.total_extent.1) <= (int64)2147483647), halide_error_buffer_extents_too_large("filter", (int64(filter.extent.2) * filter.total_extent.1), (int64)2147483647))
assert((abs(int64(input.extent.0)) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("input", abs(int64(input.extent.0)), (uint64)2147483647))
assert((abs((int64(input.extent.1) * int64(input.stride.1))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("input", abs((int64(input.extent.1) * int64(input.stride.1))), (uint64)2147483647))
assert((input.total_extent.1 <= (int64)2147483647), halide_error_buffer_extents_too_large("input", input.total_extent.1, (int64)2147483647))
assert((abs((int64(input.extent.2) * int64(input.stride.2))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("input", abs((int64(input.extent.2) * int64(input.stride.2))), (uint64)2147483647))
assert(((int64(input.extent.2) * input.total_extent.1) <= (int64)2147483647), halide_error_buffer_extents_too_large("input", (int64(input.extent.2) * input.total_extent.1), (int64)2147483647))
// Host pointers must be non-null for the three buffers that are actually
// dereferenced; `input` is not checked because its data is never accessed.
assert((d_input != reinterpret((uint64)0)), halide_error_host_is_null("Output buffer d_input"))
assert((d_output != reinterpret((uint64)0)), halide_error_host_is_null("Input buffer d_output"))
assert((filter != reinterpret((uint64)0)), halide_error_host_is_null("Input buffer filter"))
produce d_input
{
halide_profiler_set_current_func(profiler_state, profiler_token, 1)
// The active-thread count is decremented before the parallel loop; every
// parallel iteration increments it on entry and decrements it on exit, and
// the count is incremented again once the loop has finished.
halide_profiler_decr_active_threads(profiler_state)

// t55: one past the last x coordinate of d_input.
let t55 = (d_input.min.0 + d_input.extent.0)

// t56: x where the prologue ends — filter.extent.0 clamped into [d_input.min.0, t55].
let t56 = min(max(filter.extent.0, d_input.min.0), (d_input.extent.0 + d_input.min.0))

// t48: x where the epilogue starts — min(input.extent.0, t55), but never before t56.
let t48 = max(min(input.extent.0, t55), t56)

// t49: divisor used to split the fused parallel index into (n, ci).
let t49 = max(d_input.extent.2, 1)

// t46: iteration count of the fused parallel loop.
let t46 = (d_input.extent.2 * d_input.extent.1)

// t50 + x is the unclamped lower bound of r4$x in the prologue and epilogue.
let t50 = (1 - input.extent.0)

// t52 / t51 / t53: flat offset of the minimum coordinate of filter / d_output /
// d_input; subtracted below to turn coordinates into host-array indices.
let t52 = ((filter.min.0 + (filter.min.1 * filter.stride.1)) + (filter.min.2 * filter.stride.2))

let t51 = ((d_output.min.0 + (d_output.min.1 * d_output.stride.1)) + (d_output.min.2 * d_output.stride.2))

let t53 = ((d_input.min.0 + (d_input.min.1 * d_input.stride.1)) + (d_input.min.2 * d_input.stride.2))

// Loop headers read as (variable, start, count): the counts below are all
// computed as "end - start".
parallel (d_input.s0.n.nc, 0, t46)
{
halide_profiler_incr_active_threads(profiler_state)

let d_input.s0.x.prologue = t56

let d_input.s0.x.epilogue = t48

// nc % t49 is the offset along dim 2 (n); nc / t49 is the offset along dim 1 (ci).
let t58 = (d_input.s0.n.nc % t49)

let t59 = (d_input.s0.n.nc / t49)

let t57 = (d_input.s0.x.prologue - d_input.min.0)

// --- Prologue: x in [d_input.min.0, prologue); both ends of the r4$x range are clamped ---
for (d_input.s0.x, d_input.min.0, t57)
{

let d_input.s0.n.min_1.s = t58

let d_input.s0.ci.min_1.s = t59

allocate f_input_d[float32 * 1]
produce f_input_d
{
halide_profiler_set_current_func(profiler_state, profiler_token, 2)
f_input_d[0] = 0.000000f

// t60: unclamped lower bound of r4$x; t61: upper bound of r4$x.
let t60 = min((t50 + d_input.s0.x), filter.extent.0)

let t61 = min(d_input.s0.x, filter.extent.0)

// t63: filter index contribution of ci; t62: d_output index contribution of (n, x).
let t63 = (((d_input.min.1 + d_input.s0.ci.min_1.s) * filter.stride.1) - t52)

let t62 = ((((d_input.min.2 + d_input.s0.n.min_1.s) * d_output.stride.2) - t51) + d_input.s0.x)

for (f_input_d.s1.r4$y, 0, d_output.extent.1)
{

let f_input_d.s1.r4$x.new_min.s = t60

// Upper bound is forced to be >= the clamped lower bound, so t65 is never negative.
let f_input_d.s1.r4$x.new_max = max(t61, max(f_input_d.s1.r4$x.new_min.s, 0))

let t68 = max(f_input_d.s1.r4$x.new_min.s, 0)

let t65 = (f_input_d.s1.r4$x.new_max - t68)

let t67 = (t63 + (f_input_d.s1.r4$y * filter.stride.2))

let t66 = (t62 + (f_input_d.s1.r4$y * d_output.stride.1))

for (f_input_d.s1.r4$x, t68, t65)
{
f_input_d[0] = (f_input_d[0] + (d_output[(t66 - f_input_d.s1.r4$x)] * filter[(t67 + f_input_d.s1.r4$x)]))
}
}
}
consume f_input_d
{
halide_profiler_set_current_func(profiler_state, profiler_token, 1)
d_input[(((((d_input.min.1 + d_input.s0.ci.min_1.s) * d_input.stride.1) - t53) + ((d_input.min.2 + d_input.s0.n.min_1.s) * d_input.stride.2)) + d_input.s0.x)] = f_input_d[0]
}
free f_input_d
}

let t70 = (d_input.s0.n.nc % t49)

let t71 = (d_input.s0.n.nc / t49)

let t69 = (d_input.s0.x.epilogue - d_input.s0.x.prologue)

// --- Steady state: x in [prologue, epilogue); r4$x covers the full 0..filter.extent.0 range ---
for (d_input.s0.x, d_input.s0.x.prologue, t69)
{

let d_input.s0.n.min_1.s = t70

let d_input.s0.ci.min_1.s = t71

allocate f_input_d[float32 * 1]
produce f_input_d
{
halide_profiler_set_current_func(profiler_state, profiler_token, 2)
f_input_d[0] = 0.000000f

let t73 = (((d_input.min.1 + d_input.s0.ci.min_1.s) * filter.stride.1) - t52)

let t72 = ((((d_input.min.2 + d_input.s0.n.min_1.s) * d_output.stride.2) - t51) + d_input.s0.x)

for (f_input_d.s1.r4$y, 0, d_output.extent.1)
{

let t75 = (t73 + (f_input_d.s1.r4$y * filter.stride.2))

let t74 = (t72 + (f_input_d.s1.r4$y * d_output.stride.1))

for (f_input_d.s1.r4$x, 0, filter.extent.0)
{
f_input_d[0] = (f_input_d[0] + (d_output[(t74 - f_input_d.s1.r4$x)] * filter[(t75 + f_input_d.s1.r4$x)]))
}
}
}
consume f_input_d
{
halide_profiler_set_current_func(profiler_state, profiler_token, 1)
d_input[(((((d_input.min.1 + d_input.s0.ci.min_1.s) * d_input.stride.1) - t53) + ((d_input.min.2 + d_input.s0.n.min_1.s) * d_input.stride.2)) + d_input.s0.x)] = f_input_d[0]
}
free f_input_d
}

let t77 = (d_input.s0.n.nc % t49)

let t78 = (d_input.s0.n.nc / t49)

let t76 = (t55 - d_input.s0.x.epilogue)

// --- Epilogue: x in [epilogue, t55); only the lower end of the r4$x range is clamped ---
for (d_input.s0.x, d_input.s0.x.epilogue, t76)
{

let d_input.s0.n.min_1.s = t77

let d_input.s0.ci.min_1.s = t78

allocate f_input_d[float32 * 1]
produce f_input_d
{
halide_profiler_set_current_func(profiler_state, profiler_token, 2)
f_input_d[0] = 0.000000f

let t79 = min((t50 + d_input.s0.x), filter.extent.0)

let t81 = (((d_input.min.1 + d_input.s0.ci.min_1.s) * filter.stride.1) - t52)

let t80 = ((((d_input.min.2 + d_input.s0.n.min_1.s) * d_output.stride.2) - t51) + d_input.s0.x)

for (f_input_d.s1.r4$y, 0, d_output.extent.1)
{

let f_input_d.s1.r4_x.new_min$1.s = t79

// t86: clamped start of r4$x; t83: number of r4$x values from t86 up to filter.extent.0.
let t86 = max(f_input_d.s1.r4_x.new_min$1.s, 0)

let t83 = (filter.extent.0 - t86)

let t85 = (t81 + (f_input_d.s1.r4$y * filter.stride.2))

let t84 = (t80 + (f_input_d.s1.r4$y * d_output.stride.1))

for (f_input_d.s1.r4$x, t86, t83)
{
f_input_d[0] = (f_input_d[0] + (d_output[(t84 - f_input_d.s1.r4$x)] * filter[(t85 + f_input_d.s1.r4$x)]))
}
}
}
consume f_input_d
{
halide_profiler_set_current_func(profiler_state, profiler_token, 1)
d_input[(((((d_input.min.1 + d_input.s0.ci.min_1.s) * d_input.stride.1) - t53) + ((d_input.min.2 + d_input.s0.n.min_1.s) * d_input.stride.2)) + d_input.s0.x)] = f_input_d[0]
}
free f_input_d
}
halide_profiler_decr_active_threads(profiler_state)
}
halide_profiler_incr_active_threads(profiler_state)
}
}
// Balances the halide_profiler_incr_active_threads call made during profiler setup.
halide_profiler_decr_active_threads(profiler_state)
free profiling_func_stack_peak_buf
free profiling_func_names
}
// Legacy-buffer entry point for conv1d_manual_backward.
// For each of the four arguments it builds a stack-allocated halide_buffer_t,
// fills it from the caller's old-style buffer with halide_upgrade_buffer_t,
// runs conv1d_manual_backward on the upgraded buffers, and then writes results
// back to the caller's buffers. Every helper returns a status code; a non-zero
// status fails the corresponding assert and is passed on as the error value.
func conv1d_manual_backward_old_buffer_t(input, filter, d_output, d_input)
{
  // --- Stack-allocate one blank 3-dimensional buffer per argument ---
  // Each buffer gets its own zeroed shape struct (passed as both shape arguments,
  // as in the nested-let form this replaces); the structs are deliberately not
  // shared between buffers.
  let input.blank_shape = make_struct(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
  let input.upgraded = _halide_buffer_init(alloca(size_of_halide_buffer_t()), input.blank_shape, reinterpret((uint64)0), (uint64)0, reinterpret((uint64)0), 2, 32, 3, input.blank_shape, (uint64)0)

  let filter.blank_shape = make_struct(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
  let filter.upgraded = _halide_buffer_init(alloca(size_of_halide_buffer_t()), filter.blank_shape, reinterpret((uint64)0), (uint64)0, reinterpret((uint64)0), 2, 32, 3, filter.blank_shape, (uint64)0)

  let d_output.blank_shape = make_struct(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
  let d_output.upgraded = _halide_buffer_init(alloca(size_of_halide_buffer_t()), d_output.blank_shape, reinterpret((uint64)0), (uint64)0, reinterpret((uint64)0), 2, 32, 3, d_output.blank_shape, (uint64)0)

  let d_input.blank_shape = make_struct(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
  let d_input.upgraded = _halide_buffer_init(alloca(size_of_halide_buffer_t()), d_input.blank_shape, reinterpret((uint64)0), (uint64)0, reinterpret((uint64)0), 2, 32, 3, d_input.blank_shape, (uint64)0)

  // --- Upgrade the caller's buffers, in argument order ---
  let input.upgrade_status = halide_upgrade_buffer_t("input", input, input.upgraded)
  assert((input.upgrade_status == 0), input.upgrade_status)

  let filter.upgrade_status = halide_upgrade_buffer_t("filter", filter, filter.upgraded)
  assert((filter.upgrade_status == 0), filter.upgrade_status)

  let d_output.upgrade_status = halide_upgrade_buffer_t("d_output", d_output, d_output.upgraded)
  assert((d_output.upgrade_status == 0), d_output.upgrade_status)

  let d_input.upgrade_status = halide_upgrade_buffer_t("d_input", d_input, d_input.upgraded)
  assert((d_input.upgrade_status == 0), d_input.upgrade_status)

  // --- Run the real pipeline on the upgraded buffers ---
  let pipeline_status = conv1d_manual_backward(input.upgraded, filter.upgraded, d_output.upgraded, d_input.upgraded)
  assert((pipeline_status == 0), pipeline_status)

  // --- Write results back to the caller's buffers ---
  // A bounds-query buffer is downgraded in full via halide_downgrade_buffer_t;
  // any other buffer only goes through halide_downgrade_buffer_t_device_fields.
  if (_halide_buffer_is_bounds_query(input.upgraded))
  {
    let input.downgrade_status = halide_downgrade_buffer_t("input", input.upgraded, input)
    assert((input.downgrade_status == 0), input.downgrade_status)
  }
  else
  {
    let input.device_fields_status = halide_downgrade_buffer_t_device_fields("input", input.upgraded, input)
    assert((input.device_fields_status == 0), input.device_fields_status)
  }

  if (_halide_buffer_is_bounds_query(filter.upgraded))
  {
    let filter.downgrade_status = halide_downgrade_buffer_t("filter", filter.upgraded, filter)
    assert((filter.downgrade_status == 0), filter.downgrade_status)
  }
  else
  {
    let filter.device_fields_status = halide_downgrade_buffer_t_device_fields("filter", filter.upgraded, filter)
    assert((filter.device_fields_status == 0), filter.device_fields_status)
  }

  if (_halide_buffer_is_bounds_query(d_output.upgraded))
  {
    let d_output.downgrade_status = halide_downgrade_buffer_t("d_output", d_output.upgraded, d_output)
    assert((d_output.downgrade_status == 0), d_output.downgrade_status)
  }
  else
  {
    let d_output.device_fields_status = halide_downgrade_buffer_t_device_fields("d_output", d_output.upgraded, d_output)
    assert((d_output.device_fields_status == 0), d_output.device_fields_status)
  }

  if (_halide_buffer_is_bounds_query(d_input.upgraded))
  {
    let d_input.downgrade_status = halide_downgrade_buffer_t("d_input", d_input.upgraded, d_input)
    assert((d_input.downgrade_status == 0), d_input.downgrade_status)
  }
  else
  {
    let d_input.device_fields_status = halide_downgrade_buffer_t_device_fields("d_input", d_input.upgraded, d_input)
    assert((d_input.device_fields_status == 0), d_input.device_fields_status)
  }
}
}