out("ymm1") _,
);
}
+ } else if dst.as_ptr().align_offset(16) == 0 && src.as_ptr().align_offset(16) == 0 &&
+ (w % 64) == 0 && ((dstride | sstride) % 16) == 0 {
+ unsafe {
+ asm!(
+ "2:",
+ " mov {x}, {w}",
+ " 3:",
+ " movdqa xmm0, [{src}]",
+ " movdqa xmm1, [{src}+16]",
+ " movdqa xmm2, [{src}+32]",
+ " movdqa xmm3, [{src}+48]",
+ " movdqa [{dst}], xmm0",
+ " movdqa [{dst}+16], xmm1",
+ " movdqa [{dst}+32], xmm2",
+ " movdqa [{dst}+48], xmm3",
+ " add {src}, 64",
+ " add {dst}, 64",
+ " sub {x}, 64",
+ " jnz 3b",
+ " add {src}, {sstep}",
+ " add {dst}, {dstep}",
+ " dec {h}",
+ " jnz 2b",
+ dst = inout(reg) dst.as_mut_ptr() => _,
+ src = inout(reg) src.as_ptr() => _,
+ sstep = in(reg) sstride - w,
+ dstep = in(reg) dstride - w,
+ w = in(reg) w,
+ h = inout(reg) h => _,
+ x = out(reg) _,
+ out("xmm0") _,
+ out("xmm1") _,
+ out("xmm2") _,
+ out("xmm3") _,
+ );
+ }
} else {
let copy_len = dstride.min(w);
for (dline, sline) in dst.chunks_mut(dstride)