// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.

//go:build !appengine && !noasm && gc && !noasm

#include "textflag.h"

// func _dummy_()
//
// Empty function (it only executes RET). Its body carries a preprocessor
// block that defines GOAMD64_v3 whenever GOAMD64_v4 is defined, so the
// "#ifdef GOAMD64_v3" tests further down this file are also taken when only
// GOAMD64_v4 is defined.
TEXT ·_dummy_(SB), $0
#ifdef GOAMD64_v4
#ifndef GOAMD64_v3
#define GOAMD64_v3
#endif
#endif
	RET

// func encodeBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal, copy and repeat operations
// and stores the number of bytes written to dst in ret. ret is set to 0
// whenever the output would reach the limit kept in (SP).
//
// Frame layout (65560 bytes):
//   (SP)    64-bit output limit: dst_base + (src_len - 9 - src_len>>5)
//   8(SP)   32-bit input limit:  src_len - 8
//   12(SP)  32-bit src index up to which input has already been emitted
//   16(SP)  32-bit offset of the most recent match (initialised to 1)
//   20(SP)  32-bit src index at which to resume searching after a miss
//   24(SP)  hash table of 16384 32-bit src indexes (64 KiB), zeroed on entry
//
// Register roles that hold across the main loop:
//   AX = dst write pointer, DX = src_base, CX = current src index.
// The hash used for the table is: take the low 6 bytes of a 64-bit load
// (SHLQ $16), multiply by 0x0000cf1bbcdcbf9b and keep the top 14 bits.
TEXT ·encodeBlockAsm(SB), $65560-56
	// AX = dst write pointer. Clear the table at 24(SP):
	// 0x200 iterations x 128 bytes = 65536 bytes.
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBlockAsm
	// Initialise the state slots: emitted-up-to index = 0, input limit =
	// src_len-8, output limit = dst + src_len-9 - src_len>>5, offset = 1.
	// The search starts at src index 1 (CX); DX becomes src_base.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBlockAsm:
	// Next search index = CX + 4 + (CX - 12(SP))>>6, so the stride grows
	// with the amount of pending (not yet emitted) input. Leave the loop
	// when that index reaches the input limit in 8(SP).
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x06, BX
	LEAL  4(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeBlockAsm
	// SI = 8 bytes at src[CX]. R9/R10 = hashes of the 6-byte windows at
	// CX and CX+1. BX/DI = table candidates for them; the table entries
	// are then refreshed with CX and CX+1. R9 is finally reused for the
	// hash of the window at CX+2 (stored later, on the no-repeat path).
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHRQ  $0x08, R10
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x32, R9
	SHLQ  $0x10, R10
	IMULQ R8, R10
	SHRQ  $0x32, R10
	MOVL  24(SP)(R9*4), BX
	MOVL  24(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	LEAL  1(CX), R9
	MOVL  R9, 24(SP)(R10*4)
	MOVQ  SI, R9
	SHRQ  $0x10, R9
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x32, R9
	// Repeat check: compare the 4 bytes at src[CX+1] (bytes 1..4 of SI)
	// with the 4 bytes at src[CX+1-offset], offset being 16(SP).
	MOVL  CX, R8
	SUBL  16(SP), R8
	MOVL  1(DX)(R8*1), R10
	MOVQ  SI, R8
	SHRQ  $0x08, R8
	CMPL  R8, R10
	JNE   no_repeat_found_encodeBlockAsm
	// Repeat found. SI = match start (CX+1), DI = 12(SP), BX = index of
	// the matched earlier data (SI - offset); skip backward extension
	// when that index is 0.
	LEAL  1(CX), SI
	MOVL  12(SP), DI
	MOVL  SI, BX
	SUBL  16(SP), BX
	JZ    repeat_extend_back_end_encodeBlockAsm

repeat_extend_back_loop_encodeBlockAsm:
	// Extend the match backwards one byte at a time while the preceding
	// bytes agree, stopping at DI (already-emitted input) or when the
	// earlier index BX reaches 0.
	CMPL SI, DI
	JBE  repeat_extend_back_end_encodeBlockAsm
	MOVB -1(DX)(BX*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE  repeat_extend_back_end_encodeBlockAsm
	LEAL -1(SI), SI
	DECL BX
	JNZ  repeat_extend_back_loop_encodeBlockAsm

repeat_extend_back_end_encodeBlockAsm:
	// Output space check: AX + pending literal bytes + 5 must stay below
	// the limit in (SP); otherwise return 0.
	MOVL SI, BX
	SUBL 12(SP), BX
	LEAQ 5(AX)(BX*1), BX
	CMPQ BX, (SP)
	JB   repeat_dst_size_check_encodeBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

repeat_dst_size_check_encodeBlockAsm:
	// Emit the pending literals src[12(SP):SI], if any. R9 = source
	// pointer, R8 = length, BX = length-1. The header is 1 byte when
	// BX < 60, otherwise a tag byte (0xf0/0xf4/0xf8/0xfc) followed by
	// BX in 1/2/3/4 little-endian bytes.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB   one_byte_repeat_emit_encodeBlockAsm
	CMPL BX, $0x00000100
	JB   two_bytes_repeat_emit_encodeBlockAsm
	CMPL BX, $0x00010000
	JB   three_bytes_repeat_emit_encodeBlockAsm
	CMPL BX, $0x01000000
	JB   four_bytes_repeat_emit_encodeBlockAsm
	MOVB $0xfc, (AX)
	MOVL BX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm

four_bytes_repeat_emit_encodeBlockAsm:
	MOVL BX, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW BX, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm

three_bytes_repeat_emit_encodeBlockAsm:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm

two_bytes_repeat_emit_encodeBlockAsm:
	// Lengths with BX < 64 still use the short copy; longer ones use
	// the long copy.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_repeat_emit_encodeBlockAsm
	JMP  memmove_long_repeat_emit_encodeBlockAsm

one_byte_repeat_emit_encodeBlockAsm:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm:
	// BX = write pointer after the literals. Copy R8 (at most 64) bytes
	// from R9 to AX with overlapping head/tail moves; the <=8 case always
	// moves a full 8 bytes.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeBlockAsm:
	MOVQ BX, AX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm

memmove_long_repeat_emit_encodeBlockAsm:
	// Long literal copy (R8 > 64 on every path that reaches here). The
	// first and last 32 bytes are loaded into X0-X3 up front and stored
	// last with unaligned moves; the middle is copied in 32-byte steps.
	// R12 = 64 - (AX & 31), which makes AX+R12-32 a 32-byte aligned
	// address as required by the MOVOA stores.
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_repeat_emit_encodeBlockAsm:
	// Step CX past the 4 bytes already verified at CX+1 (CX += 5) and
	// measure how far the match continues: R9 = &src[CX],
	// BX = &src[CX - offset], R8 = src_len - CX bytes remaining.
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R11 counts equal bytes. Compare 16 bytes per iteration, then 8, 4,
	// 2 and 1. On a 64-bit mismatch the index of the first differing byte
	// is (trailing zero count of the XOR) >> 3.
	XORL R11, R11

matchlen_loopback_16_repeat_extend_encodeBlockAsm:
	CMPL R8, $0x10
	JB   matchlen_match8_repeat_extend_encodeBlockAsm
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (BX)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm
	XORQ 8(BX)(R11*1), R12
	JNZ  matchlen_bsf_16repeat_extend_encodeBlockAsm
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_repeat_extend_encodeBlockAsm

matchlen_bsf_16repeat_extend_encodeBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  repeat_extend_forward_end_encodeBlockAsm

matchlen_match8_repeat_extend_encodeBlockAsm:
	CMPL R8, $0x08
	JB   matchlen_match4_repeat_extend_encodeBlockAsm
	MOVQ (R9)(R11*1), R10
	XORQ (BX)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP  matchlen_match4_repeat_extend_encodeBlockAsm

matchlen_bsf_8_repeat_extend_encodeBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  repeat_extend_forward_end_encodeBlockAsm

matchlen_match4_repeat_extend_encodeBlockAsm:
	CMPL R8, $0x04
	JB   matchlen_match2_repeat_extend_encodeBlockAsm
	MOVL (R9)(R11*1), R10
	CMPL (BX)(R11*1), R10
	JNE  matchlen_match2_repeat_extend_encodeBlockAsm
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeBlockAsm:
	CMPL R8, $0x01
	JE   matchlen_match1_repeat_extend_encodeBlockAsm
	JB   repeat_extend_forward_end_encodeBlockAsm
	MOVW (R9)(R11*1), R10
	CMPW (BX)(R11*1), R10
	JNE  matchlen_match1_repeat_extend_encodeBlockAsm
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ   repeat_extend_forward_end_encodeBlockAsm

matchlen_match1_repeat_extend_encodeBlockAsm:
	MOVB (R9)(R11*1), R10
	CMPB (BX)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeBlockAsm
	LEAL 1(R11), R11

repeat_extend_forward_end_encodeBlockAsm:
	// CX = end of match; BX = total match length (CX - start in SI);
	// SI = offset from 16(SP). DI still holds the 12(SP) value read before
	// the literals were emitted: when it is 0 the match is written as a
	// copy instead of a repeat.
	ADDL  R11, CX
	MOVL  CX, BX
	SUBL  SI, BX
	MOVL  16(SP), SI
	TESTL DI, DI
	JZ    repeat_as_copy_encodeBlockAsm

	// emitRepeat
	// DI = length, BX = length-4. Encodings selected below:
	//   length <= 8                   2 bytes: (BX<<2 | 1) as 16-bit LE
	//   length < 12 and offset < 2048 2 bytes: low offset byte plus
	//                                 (offset>>8)<<5 | BX<<2 | 1
	//   BX < 260                      3 bytes: 0x15, 0x00, BX-4
	//   BX < 65792                    4 bytes: 0x19, 0x00, 16-bit (BX-256)
	//   BX < 0x0100ffff               5 bytes: 0x1d, 0x00, 24-bit (BX-65536)
	//   otherwise                     5 bytes 1d 00 fb ff ff, subtract
	//                                 16842747 from BX and go round again.
emit_repeat_again_match_repeat_encodeBlockAsm:
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_match_repeat_encodeBlockAsm
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_repeat_encodeBlockAsm
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_repeat_encodeBlockAsm

cant_repeat_two_offset_match_repeat_encodeBlockAsm:
	CMPL BX, $0x00000104
	JB   repeat_three_match_repeat_encodeBlockAsm
	CMPL BX, $0x00010100
	JB   repeat_four_match_repeat_encodeBlockAsm
	CMPL BX, $0x0100ffff
	JB   repeat_five_match_repeat_encodeBlockAsm
	LEAL -16842747(BX), BX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_repeat_encodeBlockAsm

repeat_five_match_repeat_encodeBlockAsm:
	LEAL -65536(BX), BX
	MOVL BX, SI
	MOVW $0x001d, (AX)
	MOVW BX, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_four_match_repeat_encodeBlockAsm:
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_three_match_repeat_encodeBlockAsm:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_match_repeat_encodeBlockAsm:
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_offset_match_repeat_encodeBlockAsm:
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_as_copy_encodeBlockAsm:
	// emitCopy
	// SI = offset, BX = length. Offsets >= 65536 use 5-byte copies with
	// a 32-bit offset: a first copy of 64 bytes (tag 0xff) when the
	// length exceeds 64, then either a final 5-byte copy for a remainder
	// below 4 or the emitRepeat sequence for the rest.
	CMPL SI, $0x00010000
	JB   two_byte_offset_repeat_as_copy_encodeBlockAsm
	CMPL BX, $0x40
	JBE  four_bytes_remain_repeat_as_copy_encodeBlockAsm
	MOVB $0xff, (AX)
	MOVL SI, 1(AX)
	LEAL -64(BX), BX
	ADDQ $0x05, AX
	CMPL BX, $0x04
	JB   four_bytes_remain_repeat_as_copy_encodeBlockAsm

	// emitRepeat
	// Same selection logic as the emitRepeat sequence above.
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL SI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
	CMPL BX, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL BX, $0x00010100
	JB   repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL BX, $0x0100ffff
	JB   repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
	LEAL -16842747(BX), BX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy

repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
	LEAL -65536(BX), BX
	MOVL BX, SI
	MOVW $0x001d, (AX)
	MOVW BX, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

four_bytes_remain_repeat_as_copy_encodeBlockAsm:
	// Final 5-byte copy: tag byte = length*4 - 1, then the 32-bit offset.
	// Nothing is written when the remaining length is 0.
	TESTL BX, BX
	JZ    repeat_end_emit_encodeBlockAsm
	XORL  DI, DI
	LEAL  -1(DI)(BX*4), BX
	MOVB  BL, (AX)
	MOVL  SI, 1(AX)
	ADDQ  $0x05, AX
	JMP   repeat_end_emit_encodeBlockAsm

two_byte_offset_repeat_as_copy_encodeBlockAsm:
	// Offset < 65536. Lengths up to 64 are handled by the "short" path.
	// Longer matches with offset < 2048 start with a 2-byte copy of 8
	// bytes (tag 0x11 | (offset>>8)<<5, then the low offset byte) and
	// continue with emitRepeat; with offset >= 2048 they start with a
	// 3-byte copy instead.
	CMPL BX, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm
	CMPL SI, $0x00000800
	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm
	MOVL $0x00000001, DI
	LEAL 16(DI), DI
	MOVB SI, 1(AX)
	MOVL SI, R8
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, BX

	// emitRepeat
	// Enters directly at the 3/4/5-byte selection with BX = length-4.
	LEAL -4(BX), BX
	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b

emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	CMPL SI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	CMPL BX, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	CMPL BX, $0x00010100
	JB   repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	CMPL BX, $0x0100ffff
	JB   repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	LEAL -16842747(BX), BX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b

repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	LEAL -65536(BX), BX
	MOVL BX, SI
	MOVW $0x001d, (AX)
	MOVW BX, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

long_offset_short_repeat_as_copy_encodeBlockAsm:
	// 3-byte copy of 60 bytes (tag 0xee, 16-bit offset); the remaining
	// length-60 goes through emitRepeat.
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX

	// emitRepeat
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL SI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	CMPL BX, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL BX, $0x00010100
	JB   repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL BX, $0x0100ffff
	JB   repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
	LEAL -16842747(BX), BX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short

repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	LEAL -65536(BX), BX
	MOVL BX, SI
	MOVW $0x001d, (AX)
	MOVW BX, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
	// Length <= 64, offset < 65536. With length < 12 and offset < 2048:
	// 2-byte copy, tag = (length*4 - 15) | (offset>>8)<<5 followed by the
	// low offset byte. Otherwise: 3-byte copy, tag = length*4 - 2
	// followed by the 16-bit offset.
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm
	CMPL SI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm

emit_copy_three_repeat_as_copy_encodeBlockAsm:
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm:
	// Everything up to CX is now emitted; resume searching at CX.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeBlockAsm

no_repeat_found_encodeBlockAsm:
	// Test the table candidates in turn against the 4 bytes at CX (BX),
	// CX+1 (DI) and CX+2 (entry for hash R9, which is replaced by CX+2).
	// SI is shifted right by one byte between tests. When none matches,
	// continue at the precomputed next index in 20(SP).
	CMPL (DX)(BX*1), SI
	JEQ  candidate_match_encodeBlockAsm
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ  candidate2_match_encodeBlockAsm
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ  candidate3_match_encodeBlockAsm
	MOVL 20(SP), CX
	JMP  search_loop_encodeBlockAsm

candidate3_match_encodeBlockAsm:
	ADDL $0x02, CX
	JMP  candidate_match_encodeBlockAsm

candidate2_match_encodeBlockAsm:
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeBlockAsm:
	// CX = match position, BX = candidate index. Extend the match
	// backwards while the preceding bytes agree, stopping at 12(SP)
	// (held in SI) or when BX reaches 0.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeBlockAsm

match_extend_back_loop_encodeBlockAsm:
	CMPL CX, SI
	JBE  match_extend_back_end_encodeBlockAsm
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeBlockAsm
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeBlockAsm
	JMP  match_extend_back_loop_encodeBlockAsm

match_extend_back_end_encodeBlockAsm:
	// Output space check: AX + pending literal bytes + 5 must stay below
	// the limit in (SP); otherwise return 0.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 5(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm:
	// Emit the pending literals src[12(SP):CX], if any. SI = source
	// pointer, R8 = length, DI = length-1. Header selection is the same
	// as for the literals emitted on the repeat path.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	CMPL DI, $0x3c
	JB   one_byte_match_emit_encodeBlockAsm
	CMPL DI, $0x00000100
	JB   two_bytes_match_emit_encodeBlockAsm
	CMPL DI, $0x00010000
	JB   three_bytes_match_emit_encodeBlockAsm
	CMPL DI, $0x01000000
	JB   four_bytes_match_emit_encodeBlockAsm
	MOVB $0xfc, (AX)
	MOVL DI, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_match_emit_encodeBlockAsm

four_bytes_match_emit_encodeBlockAsm:
	MOVL DI, R9
	SHRL $0x10, R9
	MOVB $0xf8, (AX)
	MOVW DI, 1(AX)
	MOVB R9, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeBlockAsm

three_bytes_match_emit_encodeBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBlockAsm

two_bytes_match_emit_encodeBlockAsm:
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	CMPL DI, $0x40
	JB   memmove_match_emit_encodeBlockAsm
	JMP  memmove_long_match_emit_encodeBlockAsm

one_byte_match_emit_encodeBlockAsm:
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm:
	// DI = write pointer after the literals; copy R8 (at most 64) bytes
	// from SI to AX.
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeBlockAsm:
	MOVQ DI, AX
	JMP  emit_literal_done_match_emit_encodeBlockAsm

memmove_long_match_emit_encodeBlockAsm:
	// Long literal copy from SI to AX (R8 bytes): head/tail kept in
	// X0-X3 and stored last, middle copied in 32-byte steps with aligned
	// stores.
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVQ  R8, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	DECQ  R10
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(SI)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  R8, R11
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  DI, AX

emit_literal_done_match_emit_encodeBlockAsm:
match_nolit_loop_encodeBlockAsm:
	// Record the offset (CX - BX) in 16(SP), skip the 4 bytes already
	// known to match, and measure how far the match continues:
	// DI = &src[CX], BX = &src[candidate], SI = src_len - CX remaining.
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 counts equal bytes; same 16/8/4/2/1 scheme as the matchLen
	// sequence on the repeat path.
	XORL R9, R9

matchlen_loopback_16_match_nolit_encodeBlockAsm:
	CMPL SI, $0x10
	JB   matchlen_match8_match_nolit_encodeBlockAsm
	MOVQ (DI)(R9*1), R8
	MOVQ 8(DI)(R9*1), R10
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm
	XORQ 8(BX)(R9*1), R10
	JNZ  matchlen_bsf_16match_nolit_encodeBlockAsm
	LEAL -16(SI), SI
	LEAL 16(R9), R9
	JMP  matchlen_loopback_16_match_nolit_encodeBlockAsm

matchlen_bsf_16match_nolit_encodeBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL 8(R9)(R10*1), R9
	JMP  match_nolit_end_encodeBlockAsm

matchlen_match8_match_nolit_encodeBlockAsm:
	CMPL SI, $0x08
	JB   matchlen_match4_match_nolit_encodeBlockAsm
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	JMP  matchlen_match4_match_nolit_encodeBlockAsm

matchlen_bsf_8_match_nolit_encodeBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP  match_nolit_end_encodeBlockAsm

matchlen_match4_match_nolit_encodeBlockAsm:
	CMPL SI, $0x04
	JB   matchlen_match2_match_nolit_encodeBlockAsm
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE  matchlen_match2_match_nolit_encodeBlockAsm
	LEAL -4(SI), SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeBlockAsm:
	CMPL SI, $0x01
	JE   matchlen_match1_match_nolit_encodeBlockAsm
	JB   match_nolit_end_encodeBlockAsm
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE  matchlen_match1_match_nolit_encodeBlockAsm
	LEAL 2(R9), R9
	SUBL $0x02, SI
	JZ   match_nolit_end_encodeBlockAsm

matchlen_match1_match_nolit_encodeBlockAsm:
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE  match_nolit_end_encodeBlockAsm
	LEAL 1(R9), R9

match_nolit_end_encodeBlockAsm:
	// CX = end of match, BX = offset, R9 = total match length (extension
	// plus the initial 4 bytes). Everything up to CX counts as emitted.
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	// Same encoding decisions as the emitCopy sequence on the repeat
	// path, with BX = offset and R9 = length.
	CMPL BX, $0x00010000
	JB   two_byte_offset_match_nolit_encodeBlockAsm
	CMPL R9, $0x40
	JBE  four_bytes_remain_match_nolit_encodeBlockAsm
	MOVB $0xff, (AX)
	MOVL BX, 1(AX)
	LEAL -64(R9), R9
	ADDQ $0x05, AX
	CMPL R9, $0x04
	JB   four_bytes_remain_match_nolit_encodeBlockAsm

	// emitRepeat
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
	MOVL R9, SI
	LEAL -4(R9), R9
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm_emit_copy
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
	CMPL BX, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy

cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
	CMPL R9, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm_emit_copy
	CMPL R9, $0x00010100
	JB   repeat_four_match_nolit_encodeBlockAsm_emit_copy
	CMPL R9, $0x0100ffff
	JB   repeat_five_match_nolit_encodeBlockAsm_emit_copy
	LEAL -16842747(R9), R9
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy

repeat_five_match_nolit_encodeBlockAsm_emit_copy:
	LEAL -65536(R9), R9
	MOVL R9, BX
	MOVW $0x001d, (AX)
	MOVW R9, 2(AX)
	SARL $0x10, BX
	MOVB BL, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_four_match_nolit_encodeBlockAsm_emit_copy:
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_three_match_nolit_encodeBlockAsm_emit_copy:
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_match_nolit_encodeBlockAsm_emit_copy:
	SHLL $0x02, R9
	ORL  $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

four_bytes_remain_match_nolit_encodeBlockAsm:
	TESTL R9, R9
	JZ    match_nolit_emitcopy_end_encodeBlockAsm
	XORL  SI, SI
	LEAL  -1(SI)(R9*4), R9
	MOVB  R9, (AX)
	MOVL  BX, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeBlockAsm

two_byte_offset_match_nolit_encodeBlockAsm:
	CMPL R9, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBlockAsm
	CMPL BX, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBlockAsm
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB BL, 1(AX)
	MOVL BX, DI
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R9

	// emitRepeat
	LEAL -4(R9), R9
	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b

emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	MOVL R9, SI
	LEAL -4(R9), R9
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
	CMPL BX, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	CMPL R9, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b
	CMPL R9, $0x00010100
	JB   repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b
	CMPL R9, $0x0100ffff
	JB   repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b
	LEAL -16842747(R9), R9
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b

repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	LEAL -65536(R9), R9
	MOVL R9, BX
	MOVW $0x001d, (AX)
	MOVW R9, 2(AX)
	SARL $0x10, BX
	MOVB BL, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	SHLL $0x02, R9
	ORL  $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

long_offset_short_match_nolit_encodeBlockAsm:
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX

	// emitRepeat
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
	MOVL R9, SI
	LEAL -4(R9), R9
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL BX, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
	CMPL R9, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL R9, $0x00010100
	JB   repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL R9, $0x0100ffff
	JB   repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
	LEAL -16842747(R9), R9
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short

repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
	LEAL -65536(R9), R9
	MOVL R9, BX
	MOVW $0x001d, (AX)
	MOVW R9, 2(AX)
	SARL $0x10, BX
	MOVB BL, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
	SHLL $0x02, R9
	ORL  $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

two_byte_offset_short_match_nolit_encodeBlockAsm:
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBlockAsm
	CMPL BX, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBlockAsm
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

emit_copy_three_match_nolit_encodeBlockAsm:
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm:
	// Stop once CX reaches the input limit; return 0 if the write
	// pointer has reached the output limit. SI = 8 bytes at src[CX-2].
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeBlockAsm
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm:
	// Store CX-2 under the hash of the window at CX-2 (DI) and CX under
	// the hash of the window at CX (BX), keeping the previous entry for
	// the latter as a new candidate. If its 4 bytes equal the 4 bytes at
	// CX, emit another match immediately without literals; otherwise go
	// back to searching at CX+1.
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  SI, DI
	SHRQ  $0x10, SI
	MOVQ  SI, BX
	SHLQ  $0x10, DI
	IMULQ R8, DI
	SHRQ  $0x32, DI
	SHLQ  $0x10, BX
	IMULQ R8, BX
	SHRQ  $0x32, BX
	LEAL  -2(CX), R8
	LEAQ  24(SP)(BX*4), R9
	MOVL  (R9), BX
	MOVL  R8, 24(SP)(DI*4)
	MOVL  CX, (R9)
	CMPL  (DX)(BX*1), SI
	JEQ   match_nolit_loop_encodeBlockAsm
	INCL  CX
	JMP   search_loop_encodeBlockAsm

emit_remainder_encodeBlockAsm:
	// Emit whatever input is left (src[12(SP):src_len]) as literals.
	// First check that AX + remaining bytes + 5 stays below the output
	// limit; otherwise return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm:
	// From here on CX = source pointer, SI = length, DX = length-1
	// (DX no longer holds src_base).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBlockAsm
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBlockAsm
	CMPL DX, $0x00010000
	JB   three_bytes_emit_remainder_encodeBlockAsm
	CMPL DX, $0x01000000
	JB   four_bytes_emit_remainder_encodeBlockAsm
	MOVB $0xfc, (AX)
	MOVL DX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm

four_bytes_emit_remainder_encodeBlockAsm:
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm

three_bytes_emit_remainder_encodeBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm

two_bytes_emit_remainder_encodeBlockAsm:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBlockAsm
	JMP  memmove_long_emit_remainder_encodeBlockAsm

one_byte_emit_remainder_encodeBlockAsm:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm:
	// DX = write pointer after the literals, BX = length. Unlike the
	// short copies in the main loop, this one selects an exact-size
	// variant for every length (1-2, 3, 4-7, 8-16, 17-32, 33-64), so no
	// access goes past the end of the data.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm

memmove_long_emit_remainder_encodeBlockAsm:
	// Long literal copy from CX to AX (BX bytes), same scheme as the
	// other genMemMoveLong sequences.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBlockAsm4MB(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// encodeBlockAsm4MB encodes src into dst and stores the number of bytes
// written to dst in ret+48(FP). Every destination-space check in the body
// stores 0 in ret+48(FP) and returns immediately when it fails.
// NOTE(review): a zero result presumably tells the Go caller that the block
// could not be encoded within the allowed output size - confirm at the caller.
// NOTE(review): the "4MB" suffix suggests an upper bound on len(src); this
// body never checks such a bound itself - confirm at the caller.
//
// Stack frame (65560 bytes):
//   0(SP)   8 bytes      dst limit = dst_base + (src_len - 9 - (src_len >> 5))
//   8(SP)   4 bytes      search limit = src_len - 8
//   12(SP)  4 bytes      nextEmit: src offset of the first byte not yet emitted
//   16(SP)  4 bytes      current repeat offset (initialised to 1)
//   20(SP)  4 bytes      src offset at which to resume when no candidate matches
//   24(SP)  65536 bytes  hash table: 16384 entries of 32-bit src offsets
//
// Registers that keep their role across the main loop:
//   AX = dst write pointer, CX = current src offset, DX = src base pointer.
TEXT ·encodeBlockAsm4MB(SB), $65560-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

// Clear the hash table: 0x200 iterations of 128 bytes each = 65536 bytes.
zero_loop_encodeBlockAsm4MB:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBlockAsm4MB
	// nextEmit = 0.
	MOVL  $0x00000000, 12(SP)
	// Compute the search limit (src_len - 8) and the dst limit
	// (dst_base + src_len - 9 - src_len/32) and spill both to the frame.
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// Start searching at src offset 1 with a repeat offset of 1.
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

// Main search loop. On entry CX is the src offset to examine.
search_loop_encodeBlockAsm4MB:
	// BX = CX + 4 + ((CX - nextEmit) >> 6): the position to try next if nothing
	// matches here; the stride grows with the distance since the last emit.
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x06, BX
	LEAL  4(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeBlockAsm4MB
	// SI = 8 bytes of src at CX; remember the next position in 20(SP).
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	// Hash the 6 bytes at CX (R9) and at CX+1 (R10): shift out the top two
	// bytes, multiply by the constant in R8 and keep the top 14 bits.
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHRQ  $0x08, R10
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x32, R9
	SHLQ  $0x10, R10
	IMULQ R8, R10
	SHRQ  $0x32, R10
	// Fetch the previous table entries (BX for CX, DI for CX+1) and replace
	// them with the current positions.
	MOVL  24(SP)(R9*4), BX
	MOVL  24(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	LEAL  1(CX), R9
	MOVL  R9, 24(SP)(R10*4)
	// R9 = hash of the 6 bytes at CX+2, used later in no_repeat_found.
	MOVQ  SI, R9
	SHRQ  $0x10, R9
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x32, R9
	// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes at
	// CX+1-repeat.
	MOVL  CX, R8
	SUBL  16(SP), R8
	MOVL  1(DX)(R8*1), R10
	MOVQ  SI, R8
	SHRQ  $0x08, R8
	CMPL  R8, R10
	JNE   no_repeat_found_encodeBlockAsm4MB
	// Repeat found. SI = match start (CX+1), DI = nextEmit,
	// BX = SI - repeat (the position being repeated).
	LEAL  1(CX), SI
	MOVL  12(SP), DI
	MOVL  SI, BX
	SUBL  16(SP), BX
	JZ    repeat_extend_back_end_encodeBlockAsm4MB

// Extend the repeat backwards while SI > nextEmit, the preceding bytes match
// and BX has not reached offset 0.
repeat_extend_back_loop_encodeBlockAsm4MB:
	CMPL SI, DI
	JBE  repeat_extend_back_end_encodeBlockAsm4MB
	MOVB -1(DX)(BX*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE  repeat_extend_back_end_encodeBlockAsm4MB
	LEAL -1(SI), SI
	DECL BX
	JNZ  repeat_extend_back_loop_encodeBlockAsm4MB

// Destination-space check: AX + 4 + (SI - nextEmit) must stay below the dst
// limit, otherwise return 0.
repeat_extend_back_end_encodeBlockAsm4MB:
	MOVL SI, BX
	SUBL 12(SP), BX
	LEAQ 4(AX)(BX*1), BX
	CMPQ BX, (SP)
	JB   repeat_dst_size_check_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

// Emit the pending literals src[nextEmit:SI], if any.
// R9 = source pointer, R8 = literal length, BX = length - 1.
repeat_dst_size_check_encodeBlockAsm4MB:
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm4MB
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	// Choose the literal header size from length-1.
	CMPL BX, $0x3c
	JB   one_byte_repeat_emit_encodeBlockAsm4MB
	CMPL BX, $0x00000100
	JB   two_bytes_repeat_emit_encodeBlockAsm4MB
	CMPL BX, $0x00010000
	JB   three_bytes_repeat_emit_encodeBlockAsm4MB
	// 4-byte header: 0xf8 followed by the low 24 bits of length-1.
	MOVL BX, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW BX, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB

// 3-byte header: 0xf4 followed by length-1 as a 16-bit value.
three_bytes_repeat_emit_encodeBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB

// 2-byte header: 0xf0 followed by length-1 as one byte. Lengths of at most 64
// use the short copy, longer ones the long copy.
two_bytes_repeat_emit_encodeBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_repeat_emit_encodeBlockAsm4MB
	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB

// 1-byte header: (length-1) << 2.
one_byte_repeat_emit_encodeBlockAsm4MB:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

// Short literal copy (length R8 <= 64). BX = dst pointer after the literals.
memmove_repeat_emit_encodeBlockAsm4MB:
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64

// Length <= 8: always reads and writes a full 8 bytes; only R8 bytes are
// accounted for when AX is advanced.
// NOTE(review): presumably safe because of the slack left by the src and dst
// limit checks - confirm.
emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB

// Length 9..16: two (possibly overlapping) 8-byte moves, head and tail.
emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB

// Length 17..32: two (possibly overlapping) 16-byte moves, head and tail.
emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm4MB

// Length 33..64: first 32 and last 32 bytes, possibly overlapping.
emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
	MOVQ BX, AX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm4MB

// Long literal copy (length R8 > 64). BX = dst pointer after the literals.
memmove_long_repeat_emit_encodeBlockAsm4MB:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// Keep the first and last 32 source bytes in X0-X3; they are stored
	// unaligned after the loop to cover the edges.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R11 = length / 32; R12 = 64 - (AX & 31), so AX + R12 - 32 is 32-byte
	// aligned, which is what the MOVOA stores below rely on.
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// DECQ does not modify CF, so JA sees CF from the SUBQ above (clear, since
	// R10 <= 31) and ZF from DECQ: taken unless the decremented count is zero.
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

// NOTE(review): JNA after DECQ branches back only when the counter just became
// zero (or CF is set); this mirrors the generator output - confirm intent in
// gen.go.
emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back

// Copy 32 bytes per iteration (unaligned loads, aligned stores) while
// length >= R12, then store the saved head and tail.
emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

// Literals are out. Advance CX by 5 (past the 4 bytes verified at CX+1) and
// extend the repeat forwards: R9 = src+CX, BX = src+CX-repeat,
// R8 = bytes left in src.
emit_literal_done_repeat_emit_encodeBlockAsm4MB:
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R11 = number of equal bytes found so far.
	XORL R11, R11

// Compare 16 bytes per iteration while at least 16 bytes remain.
matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB:
	CMPL R8, $0x10
	JB   matchlen_match8_repeat_extend_encodeBlockAsm4MB
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (BX)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
	XORQ 8(BX)(R11*1), R12
	JNZ  matchlen_bsf_16repeat_extend_encodeBlockAsm4MB
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB

// Mismatch in the second 8 bytes: the index of the lowest set bit of the XOR,
// divided by 8, is the number of equal bytes in that word.
matchlen_bsf_16repeat_extend_encodeBlockAsm4MB:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  repeat_extend_forward_end_encodeBlockAsm4MB

// Fewer than 16 bytes remain: try one 8-byte comparison.
matchlen_match8_repeat_extend_encodeBlockAsm4MB:
	CMPL R8, $0x08
	JB   matchlen_match4_repeat_extend_encodeBlockAsm4MB
	MOVQ (R9)(R11*1), R10
	XORQ (BX)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP  matchlen_match4_repeat_extend_encodeBlockAsm4MB

// Mismatch inside an 8-byte word: add the number of equal low bytes.
matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  repeat_extend_forward_end_encodeBlockAsm4MB

// Tail handling: 4 bytes, then 2 bytes, then 1 byte.
matchlen_match4_repeat_extend_encodeBlockAsm4MB:
	CMPL R8, $0x04
	JB   matchlen_match2_repeat_extend_encodeBlockAsm4MB
	MOVL (R9)(R11*1), R10
	CMPL (BX)(R11*1), R10
	JNE  matchlen_match2_repeat_extend_encodeBlockAsm4MB
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeBlockAsm4MB:
	CMPL R8, $0x01
	JE   matchlen_match1_repeat_extend_encodeBlockAsm4MB
	JB   repeat_extend_forward_end_encodeBlockAsm4MB
	MOVW (R9)(R11*1), R10
	CMPW (BX)(R11*1), R10
	JNE  matchlen_match1_repeat_extend_encodeBlockAsm4MB
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ   repeat_extend_forward_end_encodeBlockAsm4MB

matchlen_match1_repeat_extend_encodeBlockAsm4MB:
	MOVB (R9)(R11*1), R10
	CMPB (BX)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeBlockAsm4MB
	LEAL 1(R11), R11

// CX = end of the repeat; BX = total length (CX - match start in SI);
// SI = repeat offset. DI still holds nextEmit as it was before the literals
// were emitted: when it was 0 the match is emitted as a copy, not a repeat.
// NOTE(review): presumably because a repeat cannot be the first operation in
// the output - confirm against the format description.
repeat_extend_forward_end_encodeBlockAsm4MB:
	ADDL  R11, CX
	MOVL  CX, BX
	SUBL  SI, BX
	MOVL  16(SP), SI
	TESTL DI, DI
	JZ    repeat_as_copy_encodeBlockAsm4MB

	// emitRepeat
	// DI = length, BX = length - 4. Lengths <= 8 use the 2-byte form; lengths
	// below 12 with an offset below 2048 use the 2-byte form with offset.
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_match_repeat_encodeBlockAsm4MB
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_repeat_encodeBlockAsm4MB

// Pick the 3-, 4- or 5-byte repeat form from BX = length - 4.
cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
	CMPL BX, $0x00000104
	JB   repeat_three_match_repeat_encodeBlockAsm4MB
	CMPL BX, $0x00010100
	JB   repeat_four_match_repeat_encodeBlockAsm4MB
	// 5 bytes: 0x1d, 0x00, then the low 24 bits of BX - 65536.
	LEAL -65536(BX), BX
	MOVL BX, SI
	MOVW $0x001d, (AX)
	MOVW BX, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

// 4 bytes: 0x19, 0x00, then BX - 256 as a 16-bit value.
repeat_four_match_repeat_encodeBlockAsm4MB:
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

// 3 bytes: 0x15, 0x00, then BX - 4 as one byte.
repeat_three_match_repeat_encodeBlockAsm4MB:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

// 2 bytes: the 16-bit value (BX << 2) | 1.
repeat_two_match_repeat_encodeBlockAsm4MB:
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

// 2 bytes with offset: byte 0 = 1 + BX*4 + ((offset >> 8) << 5),
// byte 1 = low byte of the offset.
repeat_two_offset_match_repeat_encodeBlockAsm4MB:
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

// Emit the match as a copy. SI = offset, BX = length.
repeat_as_copy_encodeBlockAsm4MB:
	// emitCopy
	// Offsets >= 65536 need the 5-byte form with a 32-bit offset.
	CMPL SI, $0x00010000
	JB   two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
	CMPL BX, $0x40
	JBE  four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
	// Length > 64: write 0xff plus the 32-bit offset, subtract 64 from the
	// length and encode whatever is left.
	MOVB $0xff, (AX)
	MOVL SI, 1(AX)
	LEAL -64(BX), BX
	ADDQ $0x05, AX
	CMPL BX, $0x04
	JB   four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB

	// emitRepeat
	// Same selection as the emitRepeat sequence used for plain repeats:
	// DI = length, BX = length - 4.
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL SI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	CMPL BX, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL BX, $0x00010100
	JB   repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	LEAL -65536(BX), BX
	MOVL BX, SI
	MOVW $0x001d, (AX)
	MOVW BX, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

// Remaining length (possibly 0) with a 32-bit offset: nothing is written for
// 0, otherwise the tag byte length*4 - 1 followed by the 32-bit offset.
four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
	TESTL BX, BX
	JZ    repeat_end_emit_encodeBlockAsm4MB
	XORL  DI, DI
	LEAL  -1(DI)(BX*4), BX
	MOVB  BL, (AX)
	MOVL  SI, 1(AX)
	ADDQ  $0x05, AX
	JMP   repeat_end_emit_encodeBlockAsm4MB

// Offset fits in 16 bits.
two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
	CMPL BX, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
	CMPL SI, $0x00000800
	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm4MB
	// Length > 64 and offset < 2048: write a 2-byte copy with tag base 17
	// (1 + 16) and the offset bits, then subtract 8 from the length and encode
	// the rest as a repeat.
	MOVL $0x00000001, DI
	LEAL 16(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, BX

	// emitRepeat
	LEAL -4(BX), BX
	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	// The eight instructions below follow an unconditional JMP and carry no
	// label, so they are never executed.
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL SI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	CMPL BX, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL BX, $0x00010100
	JB   repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	LEAL -65536(BX), BX
	MOVL BX, SI
	MOVW $0x001d, (AX)
	MOVW BX, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

// The two labels below are referenced only from the unreachable instructions
// that follow the JMP in the sequence above.
repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

// Length > 64 and 2048 <= offset < 65536: write 0xee plus the 16-bit offset,
// subtract 60 from the length and encode the rest as a repeat.
long_offset_short_repeat_as_copy_encodeBlockAsm4MB:
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX

	// emitRepeat
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL SI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	CMPL BX, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL BX, $0x00010100
	JB   repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	LEAL -65536(BX), BX
	MOVL BX, SI
	MOVW $0x001d, (AX)
	MOVW BX, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

// Length <= 64 and offset < 65536. DI = length * 4. Lengths below 12 with an
// offset below 2048 use the 2-byte form: tag = length*4 - 15 plus
// ((offset >> 8) << 5), followed by the low offset byte.
two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
	CMPL SI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

// 3-byte form: tag = length*4 - 2, followed by the 16-bit offset.
emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

// Everything up to CX has been emitted; continue searching from CX.
repeat_end_emit_encodeBlockAsm4MB:
	MOVL CX, 12(SP)
	JMP  search_loop_encodeBlockAsm4MB

// No repeat at CX+1. Test the hash candidates in turn:
//   BX against the 4 bytes at CX, DI against the 4 bytes at CX+1, and the
//   table entry for CX+2 against the 4 bytes at CX+2.
// The table entry for CX+2 is updated on the way.
no_repeat_found_encodeBlockAsm4MB:
	CMPL (DX)(BX*1), SI
	JEQ  candidate_match_encodeBlockAsm4MB
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ  candidate2_match_encodeBlockAsm4MB
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ  candidate3_match_encodeBlockAsm4MB
	// Nothing matched: resume at the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeBlockAsm4MB

// Match found at CX+2.
candidate3_match_encodeBlockAsm4MB:
	ADDL $0x02, CX
	JMP  candidate_match_encodeBlockAsm4MB

// Match found at CX+1 against candidate DI.
candidate2_match_encodeBlockAsm4MB:
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

// CX = match position, BX = candidate position. Extend the match backwards
// while CX > nextEmit, the preceding bytes are equal and BX is above 0.
candidate_match_encodeBlockAsm4MB:
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeBlockAsm4MB

match_extend_back_loop_encodeBlockAsm4MB:
	CMPL CX, SI
	JBE  match_extend_back_end_encodeBlockAsm4MB
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeBlockAsm4MB
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeBlockAsm4MB
	JMP  match_extend_back_loop_encodeBlockAsm4MB

// Destination-space check: AX + 4 + (CX - nextEmit) must stay below the dst
// limit, otherwise return 0.
match_extend_back_end_encodeBlockAsm4MB:
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 4(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

// Emit the pending literals src[nextEmit:CX], if any.
// SI = source pointer, R8 = literal length, DI = length - 1.
match_dst_size_check_encodeBlockAsm4MB:
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm4MB
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	// Choose the literal header size from length-1.
	CMPL DI, $0x3c
	JB   one_byte_match_emit_encodeBlockAsm4MB
	CMPL DI, $0x00000100
	JB   two_bytes_match_emit_encodeBlockAsm4MB
	CMPL DI, $0x00010000
	JB   three_bytes_match_emit_encodeBlockAsm4MB
	// 4-byte header: 0xf8 followed by the low 24 bits of length-1.
	MOVL DI, R9
	SHRL $0x10, R9
	MOVB $0xf8, (AX)
	MOVW DI, 1(AX)
	MOVB R9, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeBlockAsm4MB

// 3-byte header: 0xf4 followed by length-1 as a 16-bit value.
three_bytes_match_emit_encodeBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBlockAsm4MB

// 2-byte header: 0xf0 followed by length-1 as one byte.
two_bytes_match_emit_encodeBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	CMPL DI, $0x40
	JB   memmove_match_emit_encodeBlockAsm4MB
	JMP  memmove_long_match_emit_encodeBlockAsm4MB

// 1-byte header: (length-1) << 2.
one_byte_match_emit_encodeBlockAsm4MB:
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

// Short literal copy (length R8 <= 64). DI = dst pointer after the literals.
memmove_match_emit_encodeBlockAsm4MB:
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64

// Length <= 8: always reads and writes a full 8 bytes; only R8 bytes are
// accounted for when AX is advanced.
// NOTE(review): presumably safe because of the slack left by the src and dst
// limit checks - confirm.
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB

// Length 9..16: two (possibly overlapping) 8-byte moves, head and tail.
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB

// Length 17..32: two (possibly overlapping) 16-byte moves, head and tail.
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm4MB

// Length 33..64: first 32 and last 32 bytes, possibly overlapping.
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeBlockAsm4MB:
	MOVQ DI, AX
	JMP  emit_literal_done_match_emit_encodeBlockAsm4MB

// Long literal copy (length R8 > 64). DI = dst pointer after the literals.
memmove_long_match_emit_encodeBlockAsm4MB:
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	// Keep the first and last 32 source bytes in X0-X3; they are stored
	// unaligned after the loop to cover the edges.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	// R10 = length / 32; R11 = 64 - (AX & 31), so AX + R11 - 32 is 32-byte
	// aligned, which is what the MOVOA stores below rely on.
	MOVQ  R8, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	// DECQ does not modify CF, so JA sees CF from the SUBQ above (clear, since
	// R9 <= 31) and ZF from DECQ: taken unless the decremented count is zero.
	DECQ  R10
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(SI)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

// NOTE(review): JNA after DECQ branches back only when the counter just became
// zero (or CF is set); this mirrors the generator output - confirm intent in
// gen.go.
emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back

// Copy 32 bytes per iteration (unaligned loads, aligned stores) while
// length >= R11, then store the saved head and tail.
emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  R8, R11
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  DI, AX

// Match without pending literals. CX = match position, BX = candidate
// position. Store the new offset (CX - BX) in 16(SP), skip the 4 bytes already
// known to match and measure how far the match extends:
// DI = src+CX, BX = src+candidate, SI = bytes left in src.
emit_literal_done_match_emit_encodeBlockAsm4MB:
match_nolit_loop_encodeBlockAsm4MB:
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 = number of equal bytes found so far.
	XORL R9, R9

// Compare 16 bytes per iteration while at least 16 bytes remain.
matchlen_loopback_16_match_nolit_encodeBlockAsm4MB:
	CMPL SI, $0x10
	JB   matchlen_match8_match_nolit_encodeBlockAsm4MB
	MOVQ (DI)(R9*1), R8
	MOVQ 8(DI)(R9*1), R10
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
	XORQ 8(BX)(R9*1), R10
	JNZ  matchlen_bsf_16match_nolit_encodeBlockAsm4MB
	LEAL -16(SI), SI
	LEAL 16(R9), R9
	JMP  matchlen_loopback_16_match_nolit_encodeBlockAsm4MB

// Mismatch in the second 8 bytes: the index of the lowest set bit of the XOR,
// divided by 8, is the number of equal bytes in that word.
matchlen_bsf_16match_nolit_encodeBlockAsm4MB:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL 8(R9)(R10*1), R9
	JMP  match_nolit_end_encodeBlockAsm4MB

// Fewer than 16 bytes remain: try one 8-byte comparison.
matchlen_match8_match_nolit_encodeBlockAsm4MB:
	CMPL SI, $0x08
	JB   matchlen_match4_match_nolit_encodeBlockAsm4MB
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	JMP  matchlen_match4_match_nolit_encodeBlockAsm4MB

// Mismatch inside an 8-byte word: add the number of equal low bytes.
matchlen_bsf_8_match_nolit_encodeBlockAsm4MB:
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP  match_nolit_end_encodeBlockAsm4MB

// Tail handling: 4 bytes, then 2 bytes, then 1 byte.
matchlen_match4_match_nolit_encodeBlockAsm4MB:
	CMPL SI, $0x04
	JB   matchlen_match2_match_nolit_encodeBlockAsm4MB
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE  matchlen_match2_match_nolit_encodeBlockAsm4MB
	LEAL -4(SI), SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeBlockAsm4MB:
	CMPL SI, $0x01
	JE   matchlen_match1_match_nolit_encodeBlockAsm4MB
	JB   match_nolit_end_encodeBlockAsm4MB
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE  matchlen_match1_match_nolit_encodeBlockAsm4MB
	LEAL 2(R9), R9
	SUBL $0x02, SI
	JZ   match_nolit_end_encodeBlockAsm4MB

matchlen_match1_match_nolit_encodeBlockAsm4MB:
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE  match_nolit_end_encodeBlockAsm4MB
	LEAL 1(R9), R9

// CX = end of the match, BX = offset, R9 = total match length (including the
// first 4 bytes). nextEmit is moved to CX before the copy is encoded.
match_nolit_end_encodeBlockAsm4MB:
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	// Offsets >= 65536 need the 5-byte form with a 32-bit offset.
	CMPL BX, $0x00010000
	JB   two_byte_offset_match_nolit_encodeBlockAsm4MB
	CMPL R9, $0x40
	JBE  four_bytes_remain_match_nolit_encodeBlockAsm4MB
	// Length > 64: write 0xff plus the 32-bit offset, subtract 64 from the
	// length and encode whatever is left.
	MOVB $0xff, (AX)
	MOVL BX, 1(AX)
	LEAL -64(R9), R9
	ADDQ $0x05, AX
	CMPL R9, $0x04
	JB   four_bytes_remain_match_nolit_encodeBlockAsm4MB

	// emitRepeat
	// SI = length, R9 = length - 4. Lengths <= 8 use the 2-byte form; lengths
	// below 12 with an offset below 2048 use the 2-byte form with offset.
	MOVL R9, SI
	LEAL -4(R9), R9
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL BX, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy

// Pick the 3-, 4- or 5-byte repeat form from R9 = length - 4.
cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
	CMPL R9, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL R9, $0x00010100
	JB   repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
	// 5 bytes: 0x1d, 0x00, then the low 24 bits of R9 - 65536.
	LEAL -65536(R9), R9
	MOVL R9, BX
	MOVW $0x001d, (AX)
	MOVW R9, 2(AX)
	SARL $0x10, BX
	MOVB BL, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

// 4 bytes: 0x19, 0x00, then R9 - 256 as a 16-bit value.
repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

// 3 bytes: 0x15, 0x00, then R9 - 4 as one byte.
repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

// 2 bytes: the 16-bit value (R9 << 2) | 1.
repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
	SHLL $0x02, R9
	ORL  $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

// 2 bytes with offset: byte 0 = 1 + R9*4 + ((offset >> 8) << 5),
// byte 1 = low byte of the offset.
repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

// Remaining length (possibly 0) with a 32-bit offset: nothing is written for
// 0, otherwise the tag byte length*4 - 1 followed by the 32-bit offset.
four_bytes_remain_match_nolit_encodeBlockAsm4MB:
	TESTL R9, R9
	JZ    match_nolit_emitcopy_end_encodeBlockAsm4MB
	XORL  SI, SI
	LEAL  -1(SI)(R9*4), R9
	MOVB  R9, (AX)
	MOVL  BX, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeBlockAsm4MB

// Offset fits in 16 bits.
two_byte_offset_match_nolit_encodeBlockAsm4MB:
	CMPL R9, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBlockAsm4MB
	CMPL BX, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBlockAsm4MB
	// Length > 64 and offset < 2048: write a 2-byte copy with tag base 17
	// (1 + 16) and the offset bits, then subtract 8 from the length and encode
	// the rest as a repeat.
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R9

	// emitRepeat
	LEAL -4(R9), R9
	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	// The eight instructions below follow an unconditional JMP and carry no
	// label, so they are never executed.
	MOVL R9, SI
	LEAL -4(R9), R9
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL BX, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	CMPL R9, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL R9, $0x00010100
	JB   repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	LEAL -65536(R9), R9
	MOVL R9, BX
	MOVW $0x001d, (AX)
	MOVW R9, 2(AX)
	SARL $0x10, BX
	MOVB BL, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

// The two labels below are referenced only from the unreachable instructions
// that follow the JMP in the sequence above.
repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	SHLL $0x02, R9
	ORL  $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

// Length > 64 and 2048 <= offset < 65536: write 0xee plus the 16-bit offset,
// subtract 60 from the length and encode the rest as a repeat.
long_offset_short_match_nolit_encodeBlockAsm4MB:
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX

	// emitRepeat
	MOVL R9, SI
	LEAL -4(R9), R9
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL BX, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	CMPL R9, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL R9, $0x00010100
	JB   repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
	LEAL -65536(R9), R9
	MOVL R9, BX
	MOVW $0x001d, (AX)
	MOVW R9, 2(AX)
	SARL $0x10, BX
	MOVB BL, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	SHLL $0x02, R9
	ORL  $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

// Length <= 64 and offset < 65536. SI = length * 4. Lengths below 12 with an
// offset below 2048 use the 2-byte form: tag = length*4 - 15 plus
// ((offset >> 8) << 5), followed by the low offset byte.
two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBlockAsm4MB
	CMPL BX, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBlockAsm4MB
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

// 3-byte form: tag = length*4 - 2, followed by the 16-bit offset.
emit_copy_three_match_nolit_encodeBlockAsm4MB:
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

// After the copy: stop searching once CX reaches the search limit, and return
// 0 if the dst pointer has reached the dst limit. SI = 8 bytes of src at CX-2.
match_nolit_emitcopy_end_encodeBlockAsm4MB:
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeBlockAsm4MB
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

// Index positions CX-2 and CX in the hash table, then check whether the
// previous entry for CX is an immediate match (4 bytes equal); if so, loop
// straight into another match with no literals in between.
match_nolit_dst_ok_encodeBlockAsm4MB:
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  SI, DI
	SHRQ  $0x10, SI
	MOVQ  SI, BX
	SHLQ  $0x10, DI
	IMULQ R8, DI
	SHRQ  $0x32, DI
	SHLQ  $0x10, BX
	IMULQ R8, BX
	SHRQ  $0x32, BX
	LEAL  -2(CX), R8
	LEAQ  24(SP)(BX*4), R9
	MOVL  (R9), BX
	MOVL  R8, 24(SP)(DI*4)
	MOVL  CX, (R9)
	CMPL  (DX)(BX*1), SI
	JEQ   match_nolit_loop_encodeBlockAsm4MB
	INCL  CX
	JMP   search_loop_encodeBlockAsm4MB

// Searching is finished. Check that the trailing literals
// src[nextEmit:src_len] plus 4 bytes fit below the dst limit, otherwise
// return 0.
emit_remainder_encodeBlockAsm4MB:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 4(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

// Emit the trailing literals, if any.
// CX = source pointer, SI = literal length, DX = length - 1 (DX no longer
// holds the src base from here on).
emit_remainder_ok_encodeBlockAsm4MB:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm4MB
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Choose the literal header size from length-1.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBlockAsm4MB
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBlockAsm4MB
	CMPL DX, $0x00010000
	JB   three_bytes_emit_remainder_encodeBlockAsm4MB
	// 4-byte header: 0xf8 followed by the low 24 bits of length-1.
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB

// 3-byte header: 0xf4 followed by length-1 as a 16-bit value.
three_bytes_emit_remainder_encodeBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB

// 2-byte header: 0xf0 followed by length-1 as one byte.
two_bytes_emit_remainder_encodeBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBlockAsm4MB
	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB

// 1-byte header: (length-1) << 2.
one_byte_emit_remainder_encodeBlockAsm4MB:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

// Short copy of the trailing literals (length <= 64). DX = dst pointer after
// the literals, BX = length. Unlike the short copies used inside the search
// loop, every size class here moves exactly BX bytes.
memmove_emit_remainder_encodeBlockAsm4MB:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64

// 1 or 2 bytes: first byte and last byte (the same byte when length is 1).
emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

// 3 bytes: one 16-bit move plus one byte.
emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

// 4..7 bytes: two (possibly overlapping) 4-byte moves, head and tail.
emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

// 8..16 bytes: two (possibly overlapping) 8-byte moves, head and tail.
emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

// 17..32 bytes: two (possibly overlapping) 16-byte moves, head and tail.
emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm4MB

// 33..64 bytes: first 32 and last 32 bytes, possibly overlapping.
emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm4MB

// Long copy of the trailing literals (length > 64). DX = dst pointer after the
// literals, BX = length.
memmove_long_emit_remainder_encodeBlockAsm4MB:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Keep the first and last 32 source bytes in X0-X3; they are stored
	// unaligned after the loop to cover the edges.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = length / 32; R8 = 64 - (AX & 31), so AX + R8 - 32 is 32-byte
	// aligned, which is what the MOVOA stores below rely on.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// DECQ does not modify CF, so JA sees CF from the SUBQ above (clear, since
	// SI <= 31) and ZF from DECQ: taken unless the decremented count is zero.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

// NOTE(review): JNA after DECQ branches back only when the counter just became
// zero (or CF is set); this mirrors the generator output - confirm intent in
// gen.go.
emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back

// Copy 32 bytes per iteration (unaligned loads, aligned stores) while
// length >= R8, then store the saved head and tail.
emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

// Return the number of bytes written: AX - dst_base.
emit_literal_done_emit_remainder_encodeBlockAsm4MB:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBlockAsm12B(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst and stores the number of bytes written in ret.
// Stores 0 in ret instead when the output would reach the dst limit that is
// computed below and kept in (SP).
//
// Stack frame, as used by this function:
//   0(SP)   8 bytes      dst limit pointer: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)   4 bytes      src_len - 8, the bound a search position is checked against
//   12(SP)  4 bytes      src index where the not-yet-emitted literals start
//   16(SP)  4 bytes      current repeat offset
//   20(SP)  4 bytes      search position to continue from when nothing matches
//   24(SP)  16384 bytes  hash table: 4096 entries holding 32-bit src indexes
//
// Register roles in the main loop: AX = dst write pointer, DX = src base
// pointer, CX = current src index.
TEXT ·encodeBlockAsm12B(SB), $16408-56
	MOVQ dst_base+0(FP), AX
	// Clear the hash table: 0x80 iterations of 128 bytes = 16384 bytes at 24(SP).
	MOVQ $0x00000080, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBlockAsm12B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBlockAsm12B
	// Pending literals start at src index 0.
	MOVL  $0x00000000, 12(SP)
	// 8(SP) = src_len - 8; (SP) = dst_base + (src_len - 9 - (src_len >> 5)).
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// Start searching at src index 1 with a repeat offset of 1; DX becomes the src base.
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBlockAsm12B:
	// BX = CX + 4 + ((CX - 12(SP)) >> 5): the position to continue from if
	// nothing matches here. The step grows with the number of pending literals.
	// Leave the loop for emit_remainder once BX reaches the bound in 8(SP).
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x05, BX
	LEAL  4(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeBlockAsm12B
	// SI = the 8 src bytes at CX; BX is saved in 20(SP).
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	// R9 = hash of the bytes at CX, R10 = hash of the bytes at CX+1.
	// Each hash shifts out the top 3 bytes (SHLQ 24), multiplies by R8 and
	// keeps the top 12 bits (SHRQ 52) as the table index.
	MOVQ  $0x000000cf1bbcdcbb, R8
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHRQ  $0x08, R10
	SHLQ  $0x18, R9
	IMULQ R8, R9
	SHRQ  $0x34, R9
	SHLQ  $0x18, R10
	IMULQ R8, R10
	SHRQ  $0x34, R10
	// Read the previous table entries (BX for CX, DI for CX+1) as match
	// candidates, then store CX and CX+1 in their slots.
	MOVL  24(SP)(R9*4), BX
	MOVL  24(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	LEAL  1(CX), R9
	MOVL  R9, 24(SP)(R10*4)
	// R9 = hash of the bytes at CX+2, used by no_repeat_found.
	MOVQ  SI, R9
	SHRQ  $0x10, R9
	SHLQ  $0x18, R9
	IMULQ R8, R9
	SHRQ  $0x34, R9
	// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes located
	// one repeat offset (16(SP)) earlier.
	MOVL  CX, R8
	SUBL  16(SP), R8
	MOVL  1(DX)(R8*1), R10
	MOVQ  SI, R8
	SHRQ  $0x08, R8
	CMPL  R8, R10
	JNE   no_repeat_found_encodeBlockAsm12B
	// Repeat found at SI = CX+1. DI = pending literal start, BX = SI - repeat offset.
	// Back-extension is skipped when BX is already 0.
	LEAL  1(CX), SI
	MOVL  12(SP), DI
	MOVL  SI, BX
	SUBL  16(SP), BX
	JZ    repeat_extend_back_end_encodeBlockAsm12B

repeat_extend_back_loop_encodeBlockAsm12B:
	// Extend the repeat backwards while SI stays above the pending literal
	// start, the bytes just before SI and BX are equal, and BX has not reached 0.
	CMPL SI, DI
	JBE  repeat_extend_back_end_encodeBlockAsm12B
	MOVB -1(DX)(BX*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE  repeat_extend_back_end_encodeBlockAsm12B
	LEAL -1(SI), SI
	DECL BX
	JNZ  repeat_extend_back_loop_encodeBlockAsm12B

repeat_extend_back_end_encodeBlockAsm12B:
	// BX = AX + (SI - 12(SP)) + 3, i.e. the dst pointer after the pending
	// literals plus 3 bytes. Return 0 unless it is below the limit in (SP).
	MOVL SI, BX
	SUBL 12(SP), BX
	LEAQ 3(AX)(BX*1), BX
	CMPQ BX, (SP)
	JB   repeat_dst_size_check_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

repeat_dst_size_check_encodeBlockAsm12B:
	// Emit the pending literals src[12(SP):SI]; nothing to do when the range is empty.
	// R8 = literal length, R9 = pointer to the first literal byte, BX = length - 1.
	// 12(SP) is advanced to SI.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm12B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	// Choose the literal header size from length-1: below 0x3c one byte,
	// below 0x100 two bytes, otherwise three bytes.
	CMPL BX, $0x3c
	JB   one_byte_repeat_emit_encodeBlockAsm12B
	CMPL BX, $0x00000100
	JB   two_bytes_repeat_emit_encodeBlockAsm12B
	// This second JB tests the same flags as the one above and its target is
	// the label that follows immediately, so execution continues there either way.
	JB   three_bytes_repeat_emit_encodeBlockAsm12B

three_bytes_repeat_emit_encodeBlockAsm12B:
	// Header: byte 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm12B

two_bytes_repeat_emit_encodeBlockAsm12B:
	// Header: byte 0xf0 followed by length-1 as one byte.
	// Lengths with length-1 below 0x40 use the short copy, the rest the long copy.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_repeat_emit_encodeBlockAsm12B
	JMP  memmove_long_repeat_emit_encodeBlockAsm12B

one_byte_repeat_emit_encodeBlockAsm12B:
	// Header: single byte (length-1) << 2.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm12B:
	// BX = dst pointer after the R8 literal bytes; it becomes AX once the copy is done.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	// Dispatch on R8: up to 8, up to 16, up to 32, otherwise the 33..64 variant.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
	// Moves a full 8 bytes regardless of R8; only R8 bytes are accounted for in BX.
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
	// Two 8-byte moves: the first 8 bytes and the last 8 bytes (they may overlap).
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
	// Two 16-byte moves: the first 16 bytes and the last 16 bytes (they may overlap).
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
	// Four 16-byte moves: the first 32 bytes and the last 32 bytes (they may overlap).
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeBlockAsm12B:
	MOVQ BX, AX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm12B

memmove_long_repeat_emit_encodeBlockAsm12B:
	// BX = dst pointer after the R8 literal bytes; it becomes AX once the copy is done.
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// X0..X3 hold the first and the last 32 source bytes; they are stored
	// after the loop. R11 = R8 >> 5, R12 = 64 - (AX & 31), so AX+R12 is a
	// multiple of 32 and the MOVOA stores below hit aligned addresses.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	// NOTE(review): DECQ leaves CF unchanged, so this JA is decided by CF
	// from the SUBQ above and ZF from the DECQ. For the lengths routed here
	// (length-1 >= 0x40) it appears to be always taken, which would leave the
	// large_big_loop_back loop below unused - confirm against the generator.
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration at offset R12-32 while R8 >= R12.
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	// Store the saved head and tail, covering the unaligned start and the remainder.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_repeat_emit_encodeBlockAsm12B:
	// Advance CX by 5: past the byte at CX and the 4 bytes at CX+1 that the
	// repeat check in the search loop already found equal.
	// R8 = src bytes remaining from CX, R9 = pointer to src[CX],
	// BX = pointer to src[CX - repeat offset].
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R11 counts the bytes that are equal at R9 and BX, limited by R8.
	XORL R11, R11

matchlen_loopback_16_repeat_extend_encodeBlockAsm12B:
	// Compare 16 bytes per iteration as two XORed quadwords while at least 16 remain.
	CMPL R8, $0x10
	JB   matchlen_match8_repeat_extend_encodeBlockAsm12B
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (BX)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm12B
	XORQ 8(BX)(R11*1), R12
	JNZ  matchlen_bsf_16repeat_extend_encodeBlockAsm12B
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_repeat_extend_encodeBlockAsm12B

matchlen_bsf_16repeat_extend_encodeBlockAsm12B:
	// Difference in the second quadword: lowest set bit index / 8 is the
	// number of equal bytes in it; add 8 for the first quadword.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  repeat_extend_forward_end_encodeBlockAsm12B

matchlen_match8_repeat_extend_encodeBlockAsm12B:
	// Fewer than 16 bytes remain: compare one quadword if at least 8 remain.
	CMPL R8, $0x08
	JB   matchlen_match4_repeat_extend_encodeBlockAsm12B
	MOVQ (R9)(R11*1), R10
	XORQ (BX)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm12B
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP  matchlen_match4_repeat_extend_encodeBlockAsm12B

matchlen_bsf_8_repeat_extend_encodeBlockAsm12B:
	// Difference in the quadword in R10: lowest set bit index / 8 is the
	// number of equal bytes to add to R11.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  repeat_extend_forward_end_encodeBlockAsm12B

matchlen_match4_repeat_extend_encodeBlockAsm12B:
	// Compare 4 bytes if at least 4 remain.
	CMPL R8, $0x04
	JB   matchlen_match2_repeat_extend_encodeBlockAsm12B
	MOVL (R9)(R11*1), R10
	CMPL (BX)(R11*1), R10
	JNE  matchlen_match2_repeat_extend_encodeBlockAsm12B
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeBlockAsm12B:
	// Exactly 1 byte left goes to the single-byte compare, 0 left finishes;
	// otherwise compare 2 bytes.
	CMPL R8, $0x01
	JE   matchlen_match1_repeat_extend_encodeBlockAsm12B
	JB   repeat_extend_forward_end_encodeBlockAsm12B
	MOVW (R9)(R11*1), R10
	CMPW (BX)(R11*1), R10
	JNE  matchlen_match1_repeat_extend_encodeBlockAsm12B
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ   repeat_extend_forward_end_encodeBlockAsm12B

matchlen_match1_repeat_extend_encodeBlockAsm12B:
	// Compare one final byte.
	MOVB (R9)(R11*1), R10
	CMPB (BX)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeBlockAsm12B
	LEAL 1(R11), R11

repeat_extend_forward_end_encodeBlockAsm12B:
	// CX moves past the R11 matched bytes. BX = CX - SI is the total match
	// length (SI is the match start after back-extension), SI = repeat offset.
	// DI still holds the value of 12(SP) read when the repeat was detected in
	// the search loop; when it was 0 the match is emitted as a copy instead.
	ADDL  R11, CX
	MOVL  CX, BX
	SUBL  SI, BX
	MOVL  16(SP), SI
	TESTL DI, DI
	JZ    repeat_as_copy_encodeBlockAsm12B

	// emitRepeat
	// DI = length, BX = length - 4. Lengths up to 8 use the 2-byte form.
	// Lengths below 12 with an offset below 0x800 use the 2-byte form that
	// carries the offset. Everything else uses the 3- or 4-byte form.
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_match_repeat_encodeBlockAsm12B
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_repeat_encodeBlockAsm12B

cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
	// BX >= 0x104: 4 bytes - word 0x0019 followed by BX-256 as a 16-bit value.
	CMPL BX, $0x00000104
	JB   repeat_three_match_repeat_encodeBlockAsm12B
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_three_match_repeat_encodeBlockAsm12B:
	// 3 bytes - word 0x0015 followed by BX-4 as one byte.
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_match_repeat_encodeBlockAsm12B:
	// 2 bytes - (BX << 2) | 1 written as a 16-bit value.
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_offset_match_repeat_encodeBlockAsm12B:
	// 2 bytes - first byte is 1 + 4*BX combined with (offset >> 8) << 5,
	// second byte is the low 8 bits of the offset.
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_as_copy_encodeBlockAsm12B:
	// emitCopy
	// BX = match length, SI = offset. Lengths up to 64 are handled by
	// two_byte_offset_short. Longer matches emit one copy here and the rest as
	// a repeat; offsets of 0x800 and above take the long_offset_short path.
	CMPL BX, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
	CMPL SI, $0x00000800
	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm12B
	// 2-byte copy: first byte 0x11 | ((offset >> 8) << 5), second byte the low
	// 8 bits of the offset. It accounts for 8 bytes of the match length.
	MOVL $0x00000001, DI
	LEAL 16(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, BX

	// emitRepeat
	// BX = remaining length - 4, then straight to the 3/4-byte repeat forms.
	LEAL -4(BX), BX
	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
	// The JMP above is unconditional and no label precedes the next eight
	// instructions, so they cannot be reached.
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
	CMPL SI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
	// BX >= 0x104: 4 bytes - word 0x0019 followed by BX-256 as a 16-bit value.
	CMPL BX, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
	// 3 bytes - word 0x0015 followed by BX-4 as one byte.
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
	// Within this function the only jump to this label is in the unreachable
	// instructions after the JMP above.
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
	// Within this function the only jump to this label is in the unreachable
	// instructions after the JMP above.
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

long_offset_short_repeat_as_copy_encodeBlockAsm12B:
	// 3-byte copy: byte 0xee followed by the offset as a 16-bit value.
	// It accounts for 60 bytes of the match length.
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX

	// emitRepeat
	// DI = remaining length, BX = remaining length - 4; same form selection
	// as the other emitRepeat sequences in this function.
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	CMPL SI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	// BX >= 0x104: 4 bytes - word 0x0019 followed by BX-256 as a 16-bit value.
	CMPL BX, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	// 3 bytes - word 0x0015 followed by BX-4 as one byte.
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	// 2 bytes - (BX << 2) | 1 written as a 16-bit value.
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	// 2 bytes - first byte is 1 + 4*BX combined with (offset >> 8) << 5,
	// second byte is the low 8 bits of the offset.
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
	// Length up to 64. DI = length << 2. Lengths of 12 and above, or offsets of
	// 0x800 and above, use the 3-byte copy; otherwise the 2-byte copy below.
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm12B
	CMPL SI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm12B
	// 2-byte copy: first byte (length << 2) - 15 combined with
	// (offset >> 8) << 5, second byte the low 8 bits of the offset.
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
	// 3-byte copy: byte (length << 2) - 2 followed by the offset as a 16-bit value.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm12B:
	// Everything up to CX has been emitted; resume searching.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeBlockAsm12B

no_repeat_found_encodeBlockAsm12B:
	// On entry (set up in the search loop): SI = the 8 src bytes at CX,
	// BX = table candidate for CX, DI = table candidate for CX+1,
	// R9 = hash of the bytes at CX+2.
	// Candidate 1: the 4 bytes at src[BX] against the 4 bytes at CX.
	CMPL (DX)(BX*1), SI
	JEQ  candidate_match_encodeBlockAsm12B
	// Candidate 2: the 4 bytes at src[DI] against the 4 bytes at CX+1.
	// BX is reloaded with the table candidate for CX+2 and R8 = CX+2.
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ  candidate2_match_encodeBlockAsm12B
	// Store CX+2 in the table, then candidate 3: the 4 bytes at src[BX]
	// against the 4 bytes at CX+2.
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ  candidate3_match_encodeBlockAsm12B
	// No match: continue at the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeBlockAsm12B

candidate3_match_encodeBlockAsm12B:
	// The match starts at CX+2.
	ADDL $0x02, CX
	JMP  candidate_match_encodeBlockAsm12B

candidate2_match_encodeBlockAsm12B:
	// Store CX+2 in the table (not yet done on this path); the match starts
	// at CX+1 with candidate DI.
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeBlockAsm12B:
	// CX = match position, BX = candidate position, SI = pending literal start.
	// Back-extension is skipped when the candidate is at index 0.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeBlockAsm12B

match_extend_back_loop_encodeBlockAsm12B:
	// Extend the match backwards while CX stays above the pending literal
	// start, the bytes just before CX and BX are equal, and BX has not reached 0.
	CMPL CX, SI
	JBE  match_extend_back_end_encodeBlockAsm12B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeBlockAsm12B
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeBlockAsm12B
	JMP  match_extend_back_loop_encodeBlockAsm12B

match_extend_back_end_encodeBlockAsm12B:
	// SI = AX + (CX - 12(SP)) + 3, i.e. the dst pointer after the pending
	// literals plus 3 bytes. Return 0 unless it is below the limit in (SP).
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm12B:
	// Emit the pending literals src[12(SP):CX]; nothing to do when the range is empty.
	// R8 = literal length, SI = pointer to the first literal byte, DI = length - 1.
	// 12(SP) is advanced to CX.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm12B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	// Choose the literal header size from length-1: below 0x3c one byte,
	// below 0x100 two bytes, otherwise three bytes.
	CMPL DI, $0x3c
	JB   one_byte_match_emit_encodeBlockAsm12B
	CMPL DI, $0x00000100
	JB   two_bytes_match_emit_encodeBlockAsm12B
	// This second JB tests the same flags as the one above and its target is
	// the label that follows immediately, so execution continues there either way.
	JB   three_bytes_match_emit_encodeBlockAsm12B

three_bytes_match_emit_encodeBlockAsm12B:
	// Header: byte 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBlockAsm12B

two_bytes_match_emit_encodeBlockAsm12B:
	// Header: byte 0xf0 followed by length-1 as one byte.
	// Lengths with length-1 below 0x40 use the short copy, the rest the long copy.
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	CMPL DI, $0x40
	JB   memmove_match_emit_encodeBlockAsm12B
	JMP  memmove_long_match_emit_encodeBlockAsm12B

one_byte_match_emit_encodeBlockAsm12B:
	// Header: single byte (length-1) << 2.
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm12B:
	// DI = dst pointer after the R8 literal bytes; it becomes AX once the copy is done.
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	// Dispatch on R8: up to 8, up to 16, up to 32, otherwise the 33..64 variant.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
	// Moves a full 8 bytes regardless of R8; only R8 bytes are accounted for in DI.
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
	// Two 8-byte moves: the first 8 bytes and the last 8 bytes (they may overlap).
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
	// Two 16-byte moves: the first 16 bytes and the last 16 bytes (they may overlap).
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
	// Four 16-byte moves: the first 32 bytes and the last 32 bytes (they may overlap).
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeBlockAsm12B:
	MOVQ DI, AX
	JMP  emit_literal_done_match_emit_encodeBlockAsm12B

memmove_long_match_emit_encodeBlockAsm12B:
	// DI = dst pointer after the R8 literal bytes; it becomes AX once the copy is done.
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	// X0..X3 hold the first and the last 32 source bytes; they are stored
	// after the loop. R10 = R8 >> 5, R11 = 64 - (AX & 31), so AX+R11 is a
	// multiple of 32 and the MOVOA stores below hit aligned addresses.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVQ  R8, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	DECQ  R10
	// NOTE(review): DECQ leaves CF unchanged, so this JA is decided by CF
	// from the SUBQ above and ZF from the DECQ. For the lengths routed here
	// (length-1 >= 0x40) it appears to be always taken, which would leave the
	// large_big_loop_back loop below unused - confirm against the generator.
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(SI)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration at offset R11-32 while R8 >= R11.
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  R8, R11
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	// Store the saved head and tail, covering the unaligned start and the remainder.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  DI, AX

emit_literal_done_match_emit_encodeBlockAsm12B:
match_nolit_loop_encodeBlockAsm12B:
	// CX = match position, BX = candidate position. Their distance becomes the
	// new repeat offset in 16(SP). Both advance by the 4 bytes already known
	// to be equal. SI = src bytes remaining from CX, DI = pointer to src[CX],
	// BX = pointer to the candidate bytes.
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 counts the bytes that are equal at DI and BX, limited by SI.
	XORL R9, R9

matchlen_loopback_16_match_nolit_encodeBlockAsm12B:
	// Compare 16 bytes per iteration as two XORed quadwords while at least 16 remain.
	CMPL SI, $0x10
	JB   matchlen_match8_match_nolit_encodeBlockAsm12B
	MOVQ (DI)(R9*1), R8
	MOVQ 8(DI)(R9*1), R10
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm12B
	XORQ 8(BX)(R9*1), R10
	JNZ  matchlen_bsf_16match_nolit_encodeBlockAsm12B
	LEAL -16(SI), SI
	LEAL 16(R9), R9
	JMP  matchlen_loopback_16_match_nolit_encodeBlockAsm12B

matchlen_bsf_16match_nolit_encodeBlockAsm12B:
	// Difference in the second quadword: lowest set bit index / 8 is the
	// number of equal bytes in it; add 8 for the first quadword.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL 8(R9)(R10*1), R9
	JMP  match_nolit_end_encodeBlockAsm12B

matchlen_match8_match_nolit_encodeBlockAsm12B:
	// Fewer than 16 bytes remain: compare one quadword if at least 8 remain.
	CMPL SI, $0x08
	JB   matchlen_match4_match_nolit_encodeBlockAsm12B
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm12B
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	JMP  matchlen_match4_match_nolit_encodeBlockAsm12B

matchlen_bsf_8_match_nolit_encodeBlockAsm12B:
	// Difference in the quadword in R8: lowest set bit index / 8 is the
	// number of equal bytes to add to R9.
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP  match_nolit_end_encodeBlockAsm12B

matchlen_match4_match_nolit_encodeBlockAsm12B:
	// Compare 4 bytes if at least 4 remain.
	CMPL SI, $0x04
	JB   matchlen_match2_match_nolit_encodeBlockAsm12B
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE  matchlen_match2_match_nolit_encodeBlockAsm12B
	LEAL -4(SI), SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeBlockAsm12B:
	// Exactly 1 byte left goes to the single-byte compare, 0 left finishes;
	// otherwise compare 2 bytes.
	CMPL SI, $0x01
	JE   matchlen_match1_match_nolit_encodeBlockAsm12B
	JB   match_nolit_end_encodeBlockAsm12B
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE  matchlen_match1_match_nolit_encodeBlockAsm12B
	LEAL 2(R9), R9
	SUBL $0x02, SI
	JZ   match_nolit_end_encodeBlockAsm12B

matchlen_match1_match_nolit_encodeBlockAsm12B:
	// Compare one final byte.
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE  match_nolit_end_encodeBlockAsm12B
	LEAL 1(R9), R9

match_nolit_end_encodeBlockAsm12B:
	// CX moves past the R9 extra matched bytes. BX = offset from 16(SP),
	// R9 = total match length (extra bytes + the initial 4). 12(SP) = CX, so
	// the next pending literal run starts after this match.
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	// Lengths up to 64 are handled by two_byte_offset_short. Longer matches
	// emit one copy here and the rest as a repeat; offsets of 0x800 and above
	// take the long_offset_short path.
	CMPL R9, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBlockAsm12B
	CMPL BX, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBlockAsm12B
	// 2-byte copy: first byte 0x11 | ((offset >> 8) << 5), second byte the low
	// 8 bits of the offset. It accounts for 8 bytes of the match length.
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R9

	// emitRepeat
	// R9 = remaining length - 4, then straight to the 3/4-byte repeat forms.
	LEAL -4(R9), R9
	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
	// The JMP above is unconditional and no label precedes the next eight
	// instructions, so they cannot be reached.
	MOVL R9, SI
	LEAL -4(R9), R9
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
	CMPL BX, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
	// R9 >= 0x104: 4 bytes - word 0x0019 followed by R9-256 as a 16-bit value.
	CMPL R9, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
	// 3 bytes - word 0x0015 followed by R9-4 as one byte.
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
	// Within this function the only jump to this label is in the unreachable
	// instructions after the JMP above.
	SHLL $0x02, R9
	ORL  $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
	// Within this function the only jump to this label is in the unreachable
	// instructions after the JMP above.
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

long_offset_short_match_nolit_encodeBlockAsm12B:
	// 3-byte copy: byte 0xee followed by the offset as a 16-bit value.
	// It accounts for 60 bytes of the match length.
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX

	// emitRepeat
	// SI = remaining length, R9 = remaining length - 4; same form selection
	// as the other emitRepeat sequences in this function.
	MOVL R9, SI
	LEAL -4(R9), R9
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
	CMPL BX, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
	// R9 >= 0x104: 4 bytes - word 0x0019 followed by R9-256 as a 16-bit value.
	CMPL R9, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
	// 3 bytes - word 0x0015 followed by R9-4 as one byte.
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
	// 2 bytes - (R9 << 2) | 1 written as a 16-bit value.
	SHLL $0x02, R9
	ORL  $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
	// 2 bytes - first byte is 1 + 4*R9 combined with (offset >> 8) << 5,
	// second byte is the low 8 bits of the offset.
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

two_byte_offset_short_match_nolit_encodeBlockAsm12B:
	// Length up to 64. SI = length << 2. Lengths of 12 and above, or offsets of
	// 0x800 and above, use the 3-byte copy; otherwise the 2-byte copy below.
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBlockAsm12B
	CMPL BX, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBlockAsm12B
	// 2-byte copy: first byte (length << 2) - 15 combined with
	// (offset >> 8) << 5, second byte the low 8 bits of the offset.
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

emit_copy_three_match_nolit_encodeBlockAsm12B:
	// 3-byte copy: byte (length << 2) - 2 followed by the offset as a 16-bit value.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm12B:
	// Stop searching once CX reaches the bound in 8(SP). Otherwise load the
	// 8 src bytes starting at CX-2 and return 0 if the dst pointer has reached
	// the limit in (SP).
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeBlockAsm12B
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm12B:
	// DI = hash of the bytes at CX-2, BX = hash of the bytes at CX (SI is
	// shifted right by 16 bits so that it starts at CX). Same hash as in the
	// search loop: SHLQ 24, multiply by R8, SHRQ 52.
	MOVQ  $0x000000cf1bbcdcbb, R8
	MOVQ  SI, DI
	SHRQ  $0x10, SI
	MOVQ  SI, BX
	SHLQ  $0x18, DI
	IMULQ R8, DI
	SHRQ  $0x34, DI
	SHLQ  $0x18, BX
	IMULQ R8, BX
	SHRQ  $0x34, BX
	// Read the table candidate for CX into BX, then store CX-2 and CX in their slots.
	LEAL  -2(CX), R8
	LEAQ  24(SP)(BX*4), R9
	MOVL  (R9), BX
	MOVL  R8, 24(SP)(DI*4)
	MOVL  CX, (R9)
	// If the candidate's 4 bytes equal the 4 bytes at CX, encode another match
	// right away with no literals in between; otherwise resume searching at CX+1.
	CMPL  (DX)(BX*1), SI
	JEQ   match_nolit_loop_encodeBlockAsm12B
	INCL  CX
	JMP   search_loop_encodeBlockAsm12B

emit_remainder_encodeBlockAsm12B:
	// CX = AX + (src_len - 12(SP)) + 3, i.e. the dst pointer after the
	// remaining literals plus 3 bytes. Return 0 unless it is below the limit in (SP).
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm12B:
	// Emit src[12(SP):src_len] as literals; nothing to do when the range is empty.
	// SI = literal length, CX = pointer to the first literal byte.
	// DX is reused for length - 1, so the src base pointer is gone from here on.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm12B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Choose the literal header size from length-1: below 0x3c one byte,
	// below 0x100 two bytes, otherwise three bytes.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBlockAsm12B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBlockAsm12B
	// This second JB tests the same flags as the one above and its target is
	// the label that follows immediately, so execution continues there either way.
	JB   three_bytes_emit_remainder_encodeBlockAsm12B

three_bytes_emit_remainder_encodeBlockAsm12B:
	// Header: byte 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm12B

two_bytes_emit_remainder_encodeBlockAsm12B:
	// Header: byte 0xf0 followed by length-1 as one byte.
	// Lengths with length-1 below 0x40 use the short copy, the rest the long copy.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeBlockAsm12B

one_byte_emit_remainder_encodeBlockAsm12B:
	// Header: single byte (length-1) << 2.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm12B:
	// DX = dst pointer after the literal bytes, BX = literal length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the in-loop literal copies, this one dispatches on the exact
	// length (1-2, 3, 4-7, 8-16, 17-32, 33-64) and each variant moves exactly
	// BX bytes using overlapping loads and stores.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2:
	// First byte and last byte (the same byte when BX is 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3:
	// One 2-byte move plus one 1-byte move.
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7:
	// Two 4-byte moves: the first 4 bytes and the last 4 bytes (they may overlap).
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
	// Two 8-byte moves: the first 8 bytes and the last 8 bytes (they may overlap).
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32:
	// Two 16-byte moves: the first 16 bytes and the last 16 bytes (they may overlap).
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64:
	// Four 16-byte moves: the first 32 bytes and the last 32 bytes (they may overlap).
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm12B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm12B

memmove_long_emit_remainder_encodeBlockAsm12B:
	// DX = dst pointer after the literal bytes, BX = literal length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// X0..X3 hold the first and the last 32 source bytes; they are stored
	// after the loop. DI = BX >> 5, R8 = 64 - (AX & 31), so AX+R8 is a
	// multiple of 32 and the MOVOA stores below hit aligned addresses.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	// NOTE(review): DECQ leaves CF unchanged, so this JA is decided by CF
	// from the SUBQ above and ZF from the DECQ. For the lengths routed here
	// (length-1 >= 0x40) it appears to be always taken, which would leave the
	// large_big_loop_back loop below unused - confirm against the generator.
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration at offset R8-32 while BX >= R8.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
	// Store the saved head and tail, covering the unaligned start and the remainder.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm12B:
	// Return value: number of bytes written, AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBlockAsm10B(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeBlockAsm10B(SB), $4120-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000020, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBlockAsm10B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBlockAsm10B
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBlockAsm10B:
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x05, BX
	LEAL  4(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeBlockAsm10B
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	MOVQ  $0x9e3779b1, R8
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHRQ  $0x08, R10
	SHLQ  $0x20, R9
	IMULQ R8, R9
	SHRQ  $0x36, R9
	SHLQ  $0x20, R10
	IMULQ R8, R10
	SHRQ  $0x36, R10
	MOVL  24(SP)(R9*4), BX
	MOVL  24(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	LEAL  1(CX), R9
	MOVL  R9, 24(SP)(R10*4)
	MOVQ  SI, R9
	SHRQ  $0x10, R9
	SHLQ  $0x20, R9
	IMULQ R8, R9
	SHRQ  $0x36, R9
	MOVL  CX, R8
	SUBL  16(SP), R8
	MOVL  1(DX)(R8*1), R10
	MOVQ  SI, R8
	SHRQ  $0x08, R8
	CMPL  R8, R10
	JNE   no_repeat_found_encodeBlockAsm10B
	LEAL  1(CX), SI
	MOVL  12(SP), DI
	MOVL  SI, BX
	SUBL  16(SP), BX
	JZ    repeat_extend_back_end_encodeBlockAsm10B

repeat_extend_back_loop_encodeBlockAsm10B:
	CMPL SI, DI
	JBE  repeat_extend_back_end_encodeBlockAsm10B
	MOVB -1(DX)(BX*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE  repeat_extend_back_end_encodeBlockAsm10B
	LEAL -1(SI), SI
	DECL BX
	JNZ  repeat_extend_back_loop_encodeBlockAsm10B

repeat_extend_back_end_encodeBlockAsm10B:
	MOVL SI, BX
	SUBL 12(SP), BX
	LEAQ 3(AX)(BX*1), BX
	CMPQ BX, (SP)
	JB   repeat_dst_size_check_encodeBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

repeat_dst_size_check_encodeBlockAsm10B:
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm10B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB   one_byte_repeat_emit_encodeBlockAsm10B
	CMPL BX, $0x00000100
	JB   two_bytes_repeat_emit_encodeBlockAsm10B
	JB   three_bytes_repeat_emit_encodeBlockAsm10B

three_bytes_repeat_emit_encodeBlockAsm10B:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm10B

two_bytes_repeat_emit_encodeBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_repeat_emit_encodeBlockAsm10B
	JMP  memmove_long_repeat_emit_encodeBlockAsm10B

one_byte_repeat_emit_encodeBlockAsm10B:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm10B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B

emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B

emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm10B

emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeBlockAsm10B:
	MOVQ BX, AX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm10B

memmove_long_repeat_emit_encodeBlockAsm10B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_repeat_emit_encodeBlockAsm10B:
	// Measure how far the repeat match runs. CX moves forward by 5;
	// BX = CX - 16(SP) is the position compared against and
	// R8 = src_len - CX the number of bytes left. R9 and BX are then
	// turned into addresses relative to DX.
	// NOTE(review): 16(SP) is presumably the current repeat offset and DX
	// the src base pointer — confirm against the function prologue.
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R11 = number of equal bytes found so far at (R9) and (BX).
	XORL R11, R11

matchlen_loopback_16_repeat_extend_encodeBlockAsm10B:
	// At least 16 bytes left: XOR one qword pair, then the next. A
	// non-zero XOR means the first mismatch lies inside that qword.
	CMPL R8, $0x10
	JB   matchlen_match8_repeat_extend_encodeBlockAsm10B
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (BX)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm10B
	XORQ 8(BX)(R11*1), R12
	JNZ  matchlen_bsf_16repeat_extend_encodeBlockAsm10B
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_repeat_extend_encodeBlockAsm10B

matchlen_bsf_16repeat_extend_encodeBlockAsm10B:
	// Mismatch in the second qword: index of the lowest set bit divided
	// by 8 gives the equal bytes in it; 8 more for the first qword.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  repeat_extend_forward_end_encodeBlockAsm10B

matchlen_match8_repeat_extend_encodeBlockAsm10B:
	// 8..15 bytes left: one qword compare, then continue with the 4-byte
	// step.
	CMPL R8, $0x08
	JB   matchlen_match4_repeat_extend_encodeBlockAsm10B
	MOVQ (R9)(R11*1), R10
	XORQ (BX)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm10B
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP  matchlen_match4_repeat_extend_encodeBlockAsm10B

matchlen_bsf_8_repeat_extend_encodeBlockAsm10B:
	// Mismatch in the qword at R11: add the number of equal low bytes.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  repeat_extend_forward_end_encodeBlockAsm10B

matchlen_match4_repeat_extend_encodeBlockAsm10B:
	// 4-byte step; on a mismatch fall to the 2-byte step with R8 and R11
	// unchanged.
	CMPL R8, $0x04
	JB   matchlen_match2_repeat_extend_encodeBlockAsm10B
	MOVL (R9)(R11*1), R10
	CMPL (BX)(R11*1), R10
	JNE  matchlen_match2_repeat_extend_encodeBlockAsm10B
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeBlockAsm10B:
	// R8 == 1 goes to the single-byte check, R8 == 0 is done; otherwise
	// compare 2 bytes and finish if nothing is left afterwards.
	CMPL R8, $0x01
	JE   matchlen_match1_repeat_extend_encodeBlockAsm10B
	JB   repeat_extend_forward_end_encodeBlockAsm10B
	MOVW (R9)(R11*1), R10
	CMPW (BX)(R11*1), R10
	JNE  matchlen_match1_repeat_extend_encodeBlockAsm10B
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ   repeat_extend_forward_end_encodeBlockAsm10B

matchlen_match1_repeat_extend_encodeBlockAsm10B:
	// Final single-byte check.
	MOVB (R9)(R11*1), R10
	CMPB (BX)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeBlockAsm10B
	LEAL 1(R11), R11

repeat_extend_forward_end_encodeBlockAsm10B:
	// CX moves past the R11 matched bytes; BX = CX - SI is the match
	// length (SI presumably still holds the match start — TODO confirm)
	// and SI is reloaded with the offset kept in 16(SP).
	ADDL  R11, CX
	MOVL  CX, BX
	SUBL  SI, BX
	MOVL  16(SP), SI
	// DI == 0 selects the copy encoding instead of the repeat encoding.
	// NOTE(review): DI looks like the literal start read from 12(SP)
	// before the literals were emitted — verify against the caller.
	TESTL DI, DI
	JZ    repeat_as_copy_encodeBlockAsm10B

	// emitRepeat
	// DI = length, BX = length-4, SI = offset. Form selection:
	//   length <= 8                  -> 2 bytes
	//   length < 12 && offset < 2048 -> 2 bytes carrying the offset
	//   length-4 < 260               -> 3 bytes
	//   otherwise                    -> 4 bytes
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_match_repeat_encodeBlockAsm10B
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_repeat_encodeBlockAsm10B

cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
	// 4-byte form: bytes 0x19, 0x00, then length-4-256 as a 16-bit value.
	CMPL BX, $0x00000104
	JB   repeat_three_match_repeat_encodeBlockAsm10B
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_three_match_repeat_encodeBlockAsm10B:
	// 3-byte form: bytes 0x15, 0x00, then length-8 in one byte.
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_two_match_repeat_encodeBlockAsm10B:
	// 2-byte form: ((length-4)<<2)|1 stored as a 16-bit word; the high
	// byte is zero for lengths 4..8.
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_two_offset_match_repeat_encodeBlockAsm10B:
	// 2-byte form with offset: byte 0 = ((length-4)<<2)|1 with offset>>8
	// in bits 5-7, byte 1 = low 8 bits of the offset.
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_as_copy_encodeBlockAsm10B:
	// emitCopy
	// BX = match length, SI = offset.
	//   length <= 64   -> a single copy (two_byte_offset_short)
	//   offset >= 2048 -> 3-byte copy of 60 bytes, remainder as a repeat
	//   otherwise      -> 2-byte copy of 8 bytes, remainder as a repeat
	CMPL BX, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
	CMPL SI, $0x00000800
	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm10B
	// 2-byte copy of length 8: byte 0 = 17 (= ((8-4)<<2)|1) with
	// offset>>8 in bits 5-7, byte 1 = low 8 bits of the offset.
	MOVL $0x00000001, DI
	LEAL 16(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, BX

	// emitRepeat
	// The remaining length is above 56 here, so the two short forms
	// cannot apply; go straight to the 3/4-byte selection.
	LEAL -4(BX), BX
	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
	// The next eight instructions follow an unconditional JMP and carry
	// no label, so they are never executed; kept as generated.
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
	CMPL SI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
	// BX = remaining length - 4. Below 260: 3-byte repeat. Otherwise
	// 4 bytes: 0x19, 0x00 and BX-256 as a 16-bit value.
	CMPL BX, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
	// 3-byte repeat: 0x15, 0x00, then BX-4 in one byte.
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
	// NOTE(review): within the visible code this label is referenced
	// only from the unreachable instructions above.
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
	// NOTE(review): within the visible code this label is referenced
	// only from the unreachable instructions above.
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

long_offset_short_repeat_as_copy_encodeBlockAsm10B:
	// 3-byte copy of 60 bytes: 0xee (= ((60-1)<<2)|2) followed by the
	// 16-bit offset; the remaining length-60 is emitted as a repeat.
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX

	// emitRepeat
	// DI = remaining length, BX = remaining length - 4, SI = offset.
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
	CMPL SI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
	// Below 260: 3-byte repeat. Otherwise 4 bytes: 0x19, 0x00 and BX-256
	// as a 16-bit value.
	CMPL BX, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
	// 3-byte repeat: 0x15, 0x00, then BX-4 in one byte.
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
	// 2-byte repeat: (BX<<2)|1 stored as a 16-bit word.
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
	// 2-byte repeat with offset: byte 0 = (BX<<2)|1 with offset>>8 in
	// bits 5-7, byte 1 = low 8 bits of the offset.
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
	// length <= 64. DI = length<<2. With length < 12 and offset < 2048
	// the 2-byte copy is used: byte 0 = DI-15 (= ((length-4)<<2)|1) with
	// offset>>8 in bits 5-7, byte 1 = low 8 bits of the offset.
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm10B
	CMPL SI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm10B
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm10B

emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
	// 3-byte copy: byte 0 = DI-2 (= ((length-1)<<2)|2), then the 16-bit
	// offset.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm10B:
	// Everything up to CX is now emitted; record that in 12(SP) and go
	// back to searching.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeBlockAsm10B

no_repeat_found_encodeBlockAsm10B:
	// Up to three candidate checks, each comparing 4 source bytes at a
	// table candidate with the low 32 bits of SI:
	//   candidate BX against SI as it is (match at CX),
	//   candidate DI against SI>>8 (match at CX+1),
	//   the entry at 24(SP)(R9*4) against SI>>16 (match at CX+2).
	// That table entry is replaced by CX+2 (R8) on either path.
	// NOTE(review): SI is presumably the 8 bytes loaded at src[CX], BX/DI
	// the table candidates for CX and CX+1, and R9 the table index for
	// CX+2 — confirm against the top of the search loop.
	CMPL (DX)(BX*1), SI
	JEQ  candidate_match_encodeBlockAsm10B
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ  candidate2_match_encodeBlockAsm10B
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ  candidate3_match_encodeBlockAsm10B
	// Nothing matched: continue from the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeBlockAsm10B

candidate3_match_encodeBlockAsm10B:
	// Third check hit: the match starts at CX+2, candidate already in BX.
	ADDL $0x02, CX
	JMP  candidate_match_encodeBlockAsm10B

candidate2_match_encodeBlockAsm10B:
	// Second check hit: store CX+2 in the table, move CX to CX+1 and
	// take DI as the candidate.
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeBlockAsm10B:
	// CX = match position, BX = candidate position. Extend the match
	// backwards, but not below 12(SP) (SI) and not below candidate 0.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeBlockAsm10B

match_extend_back_loop_encodeBlockAsm10B:
	// While CX > SI and the bytes just before BX and CX are equal, step
	// both back by one; stop when BX reaches 0.
	CMPL CX, SI
	JBE  match_extend_back_end_encodeBlockAsm10B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeBlockAsm10B
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeBlockAsm10B
	JMP  match_extend_back_loop_encodeBlockAsm10B

match_extend_back_end_encodeBlockAsm10B:
	// Output space check: AX + (CX - 12(SP)) + 3 must stay below the
	// limit stored in (SP); otherwise the function returns 0.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm10B:
	// Emit the pending bytes between 12(SP) and CX as a literal, unless
	// there are none. R8 = literal length, SI = source address of the
	// literal, DI = length-1; 12(SP) is set to CX.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm10B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	// Header size by length-1: below 60 one byte, below 256 two bytes,
	// otherwise three bytes. The second JB can never be taken (same
	// flags as the first); execution falls into the three-byte case.
	CMPL DI, $0x3c
	JB   one_byte_match_emit_encodeBlockAsm10B
	CMPL DI, $0x00000100
	JB   two_bytes_match_emit_encodeBlockAsm10B
	JB   three_bytes_match_emit_encodeBlockAsm10B

three_bytes_match_emit_encodeBlockAsm10B:
	// Header 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBlockAsm10B

two_bytes_match_emit_encodeBlockAsm10B:
	// Header 0xf0 followed by length-1 in one byte. Lengths with
	// length-1 >= 64 use the long copy routine.
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	CMPL DI, $0x40
	JB   memmove_match_emit_encodeBlockAsm10B
	JMP  memmove_long_match_emit_encodeBlockAsm10B

one_byte_match_emit_encodeBlockAsm10B:
	// Header (length-1)<<2 in a single byte.
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm10B:
	// Short literal copy of R8 bytes from (SI) to (AX). DI = AX+R8 is the
	// output pointer installed after the copy.
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	// Dispatch on R8: <= 8, <= 16, <= 32, otherwise up to 64.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
	// A full 8-byte load and store whatever R8 is, so up to 7 bytes
	// beyond the literal are read and written.
	// NOTE(review): presumably covered by the slack in the (SP) limit and
	// by later output overwriting the extra bytes — confirm.
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B

emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
	// First 8 and last 8 bytes; the two moves overlap for R8 < 16.
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B

emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
	// First 16 and last 16 bytes, overlapping as needed.
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm10B

emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
	// First 32 and last 32 bytes, overlapping as needed.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeBlockAsm10B:
	// Advance the output pointer past the literal.
	MOVQ DI, AX
	JMP  emit_literal_done_match_emit_encodeBlockAsm10B

memmove_long_match_emit_encodeBlockAsm10B:
	// Long literal copy of R8 bytes from (SI) to (AX). DI = AX+R8 is the
	// output pointer installed after the copy.
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	// The first and last 32 source bytes are loaded now and stored last
	// with unaligned moves.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	// R10 = R8/32, R9 = AX mod 32, R11 = 64 - R9. With this R11 the
	// address -32(AX)(R11*1) is 32-byte aligned, which is what the MOVOA
	// stores below rely on.
	MOVQ  R8, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	// DECQ does not modify CF, so JA is decided by CF from the SUBQ above
	// (no borrow, since R9 <= 31) and ZF from the DECQ. Both visible jumps
	// into this routine have length-1 >= 64, so R10-1 is non-zero there.
	DECQ  R10
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(SI)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
	// 32 bytes per iteration through the running pointers R9 (source)
	// and R12 (aligned destination); R11 is advanced in step.
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
	// Copy 32 bytes from -32(SI)(R11*1) to the aligned -32(AX)(R11*1),
	// then advance R11 by 32; repeat while R8 >= R11.
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  R8, R11
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
	// Write the saved head and tail, then move the output pointer to DI.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  DI, AX

emit_literal_done_match_emit_encodeBlockAsm10B:
match_nolit_loop_encodeBlockAsm10B:
	// CX = match position, BX = candidate position. Their distance is
	// stored in 16(SP). Both skip 4 bytes (every visible jump here follows
	// a successful 4-byte compare). SI = src_len - CX is the number of
	// bytes left; DI and BX become addresses relative to DX.
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 = number of equal bytes found so far at (DI) and (BX).
	XORL R9, R9

matchlen_loopback_16_match_nolit_encodeBlockAsm10B:
	// At least 16 bytes left: XOR one qword pair, then the next. A
	// non-zero XOR means the first mismatch lies inside that qword.
	CMPL SI, $0x10
	JB   matchlen_match8_match_nolit_encodeBlockAsm10B
	MOVQ (DI)(R9*1), R8
	MOVQ 8(DI)(R9*1), R10
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm10B
	XORQ 8(BX)(R9*1), R10
	JNZ  matchlen_bsf_16match_nolit_encodeBlockAsm10B
	LEAL -16(SI), SI
	LEAL 16(R9), R9
	JMP  matchlen_loopback_16_match_nolit_encodeBlockAsm10B

matchlen_bsf_16match_nolit_encodeBlockAsm10B:
	// Mismatch in the second qword: index of the lowest set bit divided
	// by 8 gives the equal bytes in it; 8 more for the first qword.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL 8(R9)(R10*1), R9
	JMP  match_nolit_end_encodeBlockAsm10B

matchlen_match8_match_nolit_encodeBlockAsm10B:
	// 8..15 bytes left: one qword compare, then continue with the 4-byte
	// step.
	CMPL SI, $0x08
	JB   matchlen_match4_match_nolit_encodeBlockAsm10B
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm10B
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	JMP  matchlen_match4_match_nolit_encodeBlockAsm10B

matchlen_bsf_8_match_nolit_encodeBlockAsm10B:
	// Mismatch in the qword at R9: add the number of equal low bytes.
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP  match_nolit_end_encodeBlockAsm10B

matchlen_match4_match_nolit_encodeBlockAsm10B:
	// 4-byte step; on a mismatch fall to the 2-byte step with SI and R9
	// unchanged.
	CMPL SI, $0x04
	JB   matchlen_match2_match_nolit_encodeBlockAsm10B
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE  matchlen_match2_match_nolit_encodeBlockAsm10B
	LEAL -4(SI), SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeBlockAsm10B:
	// SI == 1 goes to the single-byte check, SI == 0 is done; otherwise
	// compare 2 bytes and finish if nothing is left afterwards.
	CMPL SI, $0x01
	JE   matchlen_match1_match_nolit_encodeBlockAsm10B
	JB   match_nolit_end_encodeBlockAsm10B
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE  matchlen_match1_match_nolit_encodeBlockAsm10B
	LEAL 2(R9), R9
	SUBL $0x02, SI
	JZ   match_nolit_end_encodeBlockAsm10B

matchlen_match1_match_nolit_encodeBlockAsm10B:
	// Final single-byte check.
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE  match_nolit_end_encodeBlockAsm10B
	LEAL 1(R9), R9

match_nolit_end_encodeBlockAsm10B:
	// CX moves past the R9 extra matched bytes and is recorded in 12(SP).
	// BX = offset from 16(SP); R9 += 4 gives the full match length.
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	// R9 = match length, BX = offset.
	//   length <= 64   -> a single copy (two_byte_offset_short)
	//   offset >= 2048 -> 3-byte copy of 60 bytes, remainder as a repeat
	//   otherwise      -> 2-byte copy of 8 bytes, remainder as a repeat
	CMPL R9, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBlockAsm10B
	CMPL BX, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBlockAsm10B
	// 2-byte copy of length 8: byte 0 = 17 (= ((8-4)<<2)|1) with
	// offset>>8 in bits 5-7, byte 1 = low 8 bits of the offset.
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R9

	// emitRepeat
	// The remaining length is above 56 here, so the two short forms
	// cannot apply; go straight to the 3/4-byte selection.
	LEAL -4(R9), R9
	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
	// The next eight instructions follow an unconditional JMP and carry
	// no label, so they are never executed; kept as generated.
	MOVL R9, SI
	LEAL -4(R9), R9
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
	CMPL BX, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
	// R9 = remaining length - 4. Below 260: 3-byte repeat. Otherwise
	// 4 bytes: 0x19, 0x00 and R9-256 as a 16-bit value.
	CMPL R9, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
	// 3-byte repeat: 0x15, 0x00, then R9-4 in one byte.
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
	// NOTE(review): within the visible code this label is referenced
	// only from the unreachable instructions above.
	SHLL $0x02, R9
	ORL  $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
	// NOTE(review): within the visible code this label is referenced
	// only from the unreachable instructions above.
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

long_offset_short_match_nolit_encodeBlockAsm10B:
	// 3-byte copy of 60 bytes: 0xee (= ((60-1)<<2)|2) followed by the
	// 16-bit offset; the remaining length-60 is emitted as a repeat.
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX

	// emitRepeat
	// SI = remaining length, R9 = remaining length - 4, BX = offset.
	MOVL R9, SI
	LEAL -4(R9), R9
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
	CMPL BX, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
	// Below 260: 3-byte repeat. Otherwise 4 bytes: 0x19, 0x00 and R9-256
	// as a 16-bit value.
	CMPL R9, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
	// 3-byte repeat: 0x15, 0x00, then R9-4 in one byte.
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
	// 2-byte repeat: (R9<<2)|1 stored as a 16-bit word.
	SHLL $0x02, R9
	ORL  $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
	// 2-byte repeat with offset: byte 0 = (R9<<2)|1 with offset>>8 in
	// bits 5-7, byte 1 = low 8 bits of the offset.
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

two_byte_offset_short_match_nolit_encodeBlockAsm10B:
	// length <= 64. SI = length<<2. With length < 12 and offset < 2048
	// the 2-byte copy is used: byte 0 = SI-15 (= ((length-4)<<2)|1) with
	// offset>>8 in bits 5-7, byte 1 = low 8 bits of the offset.
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBlockAsm10B
	CMPL BX, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBlockAsm10B
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

emit_copy_three_match_nolit_encodeBlockAsm10B:
	// 3-byte copy: byte 0 = SI-2 (= ((length-1)<<2)|2), then the 16-bit
	// offset.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm10B:
	// After a copy: finish with the remainder once CX has reached the
	// bound in 8(SP); return 0 if the output pointer has reached the
	// limit in (SP). SI = 8 source bytes starting at CX-2.
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeBlockAsm10B
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm10B:
	// Hash = (low 32 bits of the value * 0x9e3779b1) reduced to its top
	// 10 bits: SHLQ 32 drops the upper bytes, SHRQ 54 keeps 10 bits.
	// DI = hash of the 4 bytes at CX-2; SI >>= 16 leaves the bytes at CX
	// in SI and BX = their hash.
	MOVQ  $0x9e3779b1, R8
	MOVQ  SI, DI
	SHRQ  $0x10, SI
	MOVQ  SI, BX
	SHLQ  $0x20, DI
	IMULQ R8, DI
	SHRQ  $0x36, DI
	SHLQ  $0x20, BX
	IMULQ R8, BX
	SHRQ  $0x36, BX
	// BX = previous table entry for the bytes at CX (the new candidate);
	// the table then records CX-2 and CX under their hashes.
	LEAL  -2(CX), R8
	LEAQ  24(SP)(BX*4), R9
	MOVL  (R9), BX
	MOVL  R8, 24(SP)(DI*4)
	MOVL  CX, (R9)
	// If the candidate's 4 bytes equal those at CX, emit another match
	// right away; otherwise resume searching at CX+1.
	CMPL  (DX)(BX*1), SI
	JEQ   match_nolit_loop_encodeBlockAsm10B
	INCL  CX
	JMP   search_loop_encodeBlockAsm10B

emit_remainder_encodeBlockAsm10B:
	// Output space check for the trailing literal:
	// AX + (src_len - 12(SP)) + 3 must stay below the limit in (SP);
	// otherwise the function returns 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm10B:
	// Emit the bytes from 12(SP) to src_len as a literal, unless there
	// are none. SI = literal length, CX = source address of the literal,
	// DX = length-1 (the former DX value is not used afterwards here).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm10B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Header size by length-1: below 60 one byte, below 256 two bytes,
	// otherwise three bytes. The second JB can never be taken (same
	// flags as the first); execution falls into the three-byte case.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBlockAsm10B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBlockAsm10B
	JB   three_bytes_emit_remainder_encodeBlockAsm10B

three_bytes_emit_remainder_encodeBlockAsm10B:
	// Header 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm10B

two_bytes_emit_remainder_encodeBlockAsm10B:
	// Header 0xf0 followed by length-1 in one byte. Lengths with
	// length-1 >= 64 use the long copy routine.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeBlockAsm10B

one_byte_emit_remainder_encodeBlockAsm10B:
	// Header (length-1)<<2 in a single byte.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm10B:
	// Short copy of the trailing literal: BX = SI bytes from (CX) to
	// (AX). DX = AX+SI is the output pointer installed after the copy.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Exact-size dispatch on BX (1-2, 3, 4-7, 8-16, 17-32, up to 64);
	// unlike the 8-byte minimum used for literals in mid-block, no case
	// here touches bytes outside the BX-byte range.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2:
	// First and last byte (the same byte when BX == 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3:
	// One 2-byte move plus the third byte.
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7:
	// First 4 and last 4 bytes, overlapping as needed.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
	// First 8 and last 8 bytes, overlapping as needed.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
	// First 16 and last 16 bytes, overlapping as needed.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
	// First 32 and last 32 bytes, overlapping as needed.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm10B:
	// Advance the output pointer past the literal.
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm10B

memmove_long_emit_remainder_encodeBlockAsm10B:
	// Long copy of the trailing literal: BX = SI bytes from (CX) to (AX).
	// DX = AX+SI is the output pointer installed after the copy.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// The first and last 32 source bytes are loaded now and stored last
	// with unaligned moves.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = BX/32, SI = AX mod 32, R8 = 64 - SI. With this R8 the address
	// -32(AX)(R8*1) is 32-byte aligned, which is what the MOVOA stores
	// below rely on.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// DECQ does not modify CF, so JA is decided by CF from the SUBQ above
	// (no borrow, since SI <= 31) and ZF from the DECQ. Both visible jumps
	// into this routine have length-1 >= 64, so DI-1 is non-zero there.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
	// 32 bytes per iteration through the running pointers SI (source)
	// and R9 (aligned destination); R8 is advanced in step.
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
	// Copy 32 bytes from -32(CX)(R8*1) to the aligned -32(AX)(R8*1),
	// then advance R8 by 32; repeat while BX >= R8.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
	// Write the saved head and tail, then move the output pointer to DX.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm10B:
	// Return value = number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBlockAsm8B(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Frame slots as used by the code below:
//   (SP)    output limit: dst_base + src_len - 9 - src_len/32
//   8(SP)   src_len - 8, the bound tested by the search loop
//   12(SP)  start of the source bytes not yet emitted
//   16(SP)  repeat offset, initially 1
//   20(SP)  position computed at the top of the search loop
//   24(SP)  1024-byte table of 256 32-bit source positions
// AX is the output write pointer, DX the src base and CX the current
// source position.
TEXT ·encodeBlockAsm8B(SB), $1048-56
	MOVQ dst_base+0(FP), AX
	// Clear the table: 8 iterations of 128 bytes starting at 24(SP).
	MOVQ $0x00000008, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBlockAsm8B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBlockAsm8B
	// 12(SP) = 0, 8(SP) = src_len - 8,
	// (SP) = dst_base + (src_len - 9 - src_len>>5).
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// Start at source position 1 with a repeat offset of 1.
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBlockAsm8B:
	// BX = CX + 4 + ((CX - 12(SP)) >> 4): the step grows with the
	// distance from the last emitted position. Once BX reaches the bound
	// in 8(SP) only the remainder is left; otherwise BX is saved in
	// 20(SP) and SI is loaded with the 8 source bytes at CX.
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x04, BX
	LEAL  4(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeBlockAsm8B
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	// Hash = (low 32 bits of the value * 0x9e3779b1) reduced to its top
	// 8 bits: SHLQ 32 drops the upper bytes, SHRQ 56 keeps 8 bits.
	// R9 = hash of the 4 bytes at CX, R10 = hash of the 4 bytes at CX+1.
	MOVQ  $0x9e3779b1, R8
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHRQ  $0x08, R10
	SHLQ  $0x20, R9
	IMULQ R8, R9
	SHRQ  $0x38, R9
	SHLQ  $0x20, R10
	IMULQ R8, R10
	SHRQ  $0x38, R10
	// BX and DI receive the previous table entries for the two hashes;
	// the table then records CX and CX+1.
	MOVL  24(SP)(R9*4), BX
	MOVL  24(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	LEAL  1(CX), R9
	MOVL  R9, 24(SP)(R10*4)
	// R9 = hash of the 4 bytes at CX+2.
	MOVQ  SI, R9
	SHRQ  $0x10, R9
	SHLQ  $0x20, R9
	IMULQ R8, R9
	SHRQ  $0x38, R9
	// Repeat check: the 4 bytes at CX+1-16(SP) against the 4 bytes at
	// CX+1 (low 32 bits of SI>>8).
	MOVL  CX, R8
	SUBL  16(SP), R8
	MOVL  1(DX)(R8*1), R10
	MOVQ  SI, R8
	SHRQ  $0x08, R8
	CMPL  R8, R10
	JNE   no_repeat_found_encodeBlockAsm8B
	// Repeat found: SI = CX+1 (match start), DI = 12(SP),
	// BX = SI - 16(SP) (the position it repeats). BX == 0 skips the
	// backward extension.
	LEAL  1(CX), SI
	MOVL  12(SP), DI
	MOVL  SI, BX
	SUBL  16(SP), BX
	JZ    repeat_extend_back_end_encodeBlockAsm8B

repeat_extend_back_loop_encodeBlockAsm8B:
	// Extend the repeat backwards: while SI > DI and the bytes just
	// before BX and SI are equal, step both back by one; stop when BX
	// reaches 0.
	CMPL SI, DI
	JBE  repeat_extend_back_end_encodeBlockAsm8B
	MOVB -1(DX)(BX*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE  repeat_extend_back_end_encodeBlockAsm8B
	LEAL -1(SI), SI
	DECL BX
	JNZ  repeat_extend_back_loop_encodeBlockAsm8B

repeat_extend_back_end_encodeBlockAsm8B:
	// Output space check: AX + (SI - 12(SP)) + 3 must stay below the
	// limit stored in (SP); otherwise the function returns 0.
	MOVL SI, BX
	SUBL 12(SP), BX
	LEAQ 3(AX)(BX*1), BX
	CMPQ BX, (SP)
	JB   repeat_dst_size_check_encodeBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

repeat_dst_size_check_encodeBlockAsm8B:
	// Emit the pending bytes between 12(SP) and SI as a literal, unless
	// there are none. R8 = literal length, R9 = source address of the
	// literal, BX = length-1; 12(SP) is set to SI.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm8B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	// Header size by length-1: below 60 one byte, below 256 two bytes,
	// otherwise three bytes. The second JB can never be taken (same
	// flags as the first); execution falls into the three-byte case.
	CMPL BX, $0x3c
	JB   one_byte_repeat_emit_encodeBlockAsm8B
	CMPL BX, $0x00000100
	JB   two_bytes_repeat_emit_encodeBlockAsm8B
	JB   three_bytes_repeat_emit_encodeBlockAsm8B

three_bytes_repeat_emit_encodeBlockAsm8B:
	// Header 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm8B

two_bytes_repeat_emit_encodeBlockAsm8B:
	// Header 0xf0 followed by length-1 in one byte. Lengths with
	// length-1 >= 64 use the long copy routine.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_repeat_emit_encodeBlockAsm8B
	JMP  memmove_long_repeat_emit_encodeBlockAsm8B

one_byte_repeat_emit_encodeBlockAsm8B:
	// Header (length-1)<<2 in a single byte.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm8B:
	// Short literal copy of R8 bytes from (R9) to (AX). BX = AX+R8 is the
	// output pointer installed after the copy.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	// Dispatch on R8: <= 8, <= 16, <= 32, otherwise up to 64.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
	// A full 8-byte load and store whatever R8 is, so up to 7 bytes
	// beyond the literal are read and written.
	// NOTE(review): presumably covered by the slack in the (SP) limit and
	// by later output overwriting the extra bytes — confirm.
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B

emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
	// First 8 and last 8 bytes; the two moves overlap for R8 < 16.
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B

emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
	// First 16 and last 16 bytes, overlapping as needed.
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm8B

emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
	// First 32 and last 32 bytes, overlapping as needed.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeBlockAsm8B:
	// Advance the output pointer past the literal.
	MOVQ BX, AX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm8B

memmove_long_repeat_emit_encodeBlockAsm8B:
	// Long literal copy of R8 bytes from (R9) to (AX). BX = AX+R8 is the
	// output pointer installed after the copy.
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// The first and last 32 source bytes are loaded now and stored last
	// with unaligned moves.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R11 = R8/32, R10 = AX mod 32, R12 = 64 - R10. With this R12 the
	// address -32(AX)(R12*1) is 32-byte aligned, which is what the MOVOA
	// stores below rely on.
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// DECQ does not modify CF, so JA is decided by CF from the SUBQ above
	// (no borrow, since R10 <= 31) and ZF from the DECQ. Both visible
	// jumps into this routine have length-1 >= 64, so R11-1 is non-zero
	// there.
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
	// 32 bytes per iteration through the running pointers R10 (source)
	// and R13 (aligned destination); R12 is advanced in step.
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
	// Copy 32 bytes from -32(R9)(R12*1) to the aligned -32(AX)(R12*1),
	// then advance R12 by 32; repeat while R8 >= R12.
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	// Write the saved head and tail, then move the output pointer to BX.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_repeat_emit_encodeBlockAsm8B:
	// Measure how far the repeat match runs. CX moves forward by 5 (the
	// repeat was detected on the 4 bytes at CX+1); BX = CX - 16(SP) is the
	// position compared against and R8 = src_len - CX the number of bytes
	// left. R9 and BX are then turned into addresses relative to DX.
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R11 = number of equal bytes found so far at (R9) and (BX).
	XORL R11, R11

matchlen_loopback_16_repeat_extend_encodeBlockAsm8B:
	// At least 16 bytes left: XOR one qword pair, then the next. A
	// non-zero XOR means the first mismatch lies inside that qword.
	CMPL R8, $0x10
	JB   matchlen_match8_repeat_extend_encodeBlockAsm8B
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (BX)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm8B
	XORQ 8(BX)(R11*1), R12
	JNZ  matchlen_bsf_16repeat_extend_encodeBlockAsm8B
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_repeat_extend_encodeBlockAsm8B

matchlen_bsf_16repeat_extend_encodeBlockAsm8B:
	// Mismatch in the second qword: index of the lowest set bit divided
	// by 8 gives the equal bytes in it; 8 more for the first qword.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  repeat_extend_forward_end_encodeBlockAsm8B

matchlen_match8_repeat_extend_encodeBlockAsm8B:
	// 8..15 bytes left: one qword compare, then continue with the 4-byte
	// step.
	CMPL R8, $0x08
	JB   matchlen_match4_repeat_extend_encodeBlockAsm8B
	MOVQ (R9)(R11*1), R10
	XORQ (BX)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm8B
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP  matchlen_match4_repeat_extend_encodeBlockAsm8B

matchlen_bsf_8_repeat_extend_encodeBlockAsm8B:
	// Mismatch in the qword at R11: add the number of equal low bytes.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  repeat_extend_forward_end_encodeBlockAsm8B

matchlen_match4_repeat_extend_encodeBlockAsm8B:
	// 4-byte step; on a mismatch fall to the 2-byte step with R8 and R11
	// unchanged.
	CMPL R8, $0x04
	JB   matchlen_match2_repeat_extend_encodeBlockAsm8B
	MOVL (R9)(R11*1), R10
	CMPL (BX)(R11*1), R10
	JNE  matchlen_match2_repeat_extend_encodeBlockAsm8B
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeBlockAsm8B:
	// R8 == 1 goes to the single-byte check, R8 == 0 is done; otherwise
	// compare 2 bytes and finish if nothing is left afterwards.
	CMPL R8, $0x01
	JE   matchlen_match1_repeat_extend_encodeBlockAsm8B
	JB   repeat_extend_forward_end_encodeBlockAsm8B
	MOVW (R9)(R11*1), R10
	CMPW (BX)(R11*1), R10
	JNE  matchlen_match1_repeat_extend_encodeBlockAsm8B
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ   repeat_extend_forward_end_encodeBlockAsm8B

matchlen_match1_repeat_extend_encodeBlockAsm8B:
	// Final single-byte check.
	MOVB (R9)(R11*1), R10
	CMPB (BX)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeBlockAsm8B
	LEAL 1(R11), R11

repeat_extend_forward_end_encodeBlockAsm8B:
	// CX += R11 (bytes matched by matchLen), then BX = CX - SI and
	// SI = 16(SP). BX is used below as the length to encode.
	// NOTE(review): SI and DI are set before this excerpt; SI presumably
	// holds the start of the repeat - confirm. When DI is zero the match is
	// emitted through the emitCopy path instead of emitRepeat.
	ADDL  R11, CX
	MOVL  CX, BX
	SUBL  SI, BX
	MOVL  16(SP), SI
	TESTL DI, DI
	JZ    repeat_as_copy_encodeBlockAsm8B

	// emitRepeat
	// SI = length, BX = length - 4. Lengths up to 8 use the two-byte form.
	MOVL BX, SI
	LEAL -4(BX), BX
	CMPL SI, $0x08
	JBE  repeat_two_match_repeat_encodeBlockAsm8B
	// The JAE target is the very next instruction, so both outcomes of this
	// compare continue at the same place.
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_repeat_encodeBlockAsm8B

cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
	// length-4 < 0x104 -> three-byte form; otherwise four-byte form:
	// 16-bit word 0x0019 followed by the 16-bit value (length - 4 - 256).
	CMPL BX, $0x00000104
	JB   repeat_three_match_repeat_encodeBlockAsm8B
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_three_match_repeat_encodeBlockAsm8B:
	// Three-byte form: 16-bit word 0x0015 followed by the byte (length - 8).
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_two_match_repeat_encodeBlockAsm8B:
	// Two-byte form: 16-bit word ((length - 4) << 2) | 1.
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm8B
	// The instructions below follow an unconditional JMP and carry no label,
	// so nothing in this file can reach them.
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_as_copy_encodeBlockAsm8B:
	// emitCopy
	CMPL BX, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
	CMPL SI, $0x00000800
	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm8B
	MOVL $0x00000001, DI
	LEAL 16(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, BX

	// emitRepeat
	LEAL -4(BX), BX
	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
	MOVL BX, SI
	LEAL -4(BX), BX
	CMPL SI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
	CMPL BX, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm8B
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

long_offset_short_repeat_as_copy_encodeBlockAsm8B:
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX

	// emitRepeat
	MOVL BX, SI
	LEAL -4(BX), BX
	CMPL SI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
	CMPL BX, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
	SHLL $0x02, BX
	ORL  $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm8B
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm8B
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm8B

emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm8B:
	// Everything up to CX has been emitted: record CX in 12(SP), which the
	// literal emitters read as the start of the not-yet-emitted bytes.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeBlockAsm8B

no_repeat_found_encodeBlockAsm8B:
	// Try up to three table candidates against successive 4-byte windows.
	// NOTE(review): SI, BX, DI and R9 are loaded before this excerpt;
	// presumably SI holds 8 bytes read at src[CX], BX/DI are the table
	// entries looked up for CX and CX+1, and R9 is the table index computed
	// for CX+2 - confirm against the search loop.
	//
	// Candidate 1: 4 bytes at src[BX] against the low 4 bytes of SI.
	CMPL (DX)(BX*1), SI
	JEQ  candidate_match_encodeBlockAsm8B
	// Shift the window by one byte; fetch the third candidate into BX and
	// keep CX+2 in R8 for the table update.
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	// Candidate 2: 4 bytes at src[DI].
	CMPL (DX)(DI*1), SI
	JEQ  candidate2_match_encodeBlockAsm8B
	// Store CX+2 in table slot R9, shift once more and test candidate 3.
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ  candidate3_match_encodeBlockAsm8B
	// No candidate matched: continue searching at the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeBlockAsm8B

candidate3_match_encodeBlockAsm8B:
	// The match was found with the window shifted by two bytes.
	ADDL $0x02, CX
	JMP  candidate_match_encodeBlockAsm8B

candidate2_match_encodeBlockAsm8B:
	// The match was found with the window shifted by one byte; the table
	// store skipped by the early exit above is done here, and DI becomes the
	// candidate. Falls through to candidate_match.
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeBlockAsm8B:
	// Extend the match backwards. CX = current position, BX = candidate
	// position, SI = 12(SP) (start of bytes not yet emitted). Nothing to do
	// when the candidate already sits at index 0.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeBlockAsm8B

match_extend_back_loop_encodeBlockAsm8B:
	// Stop when CX would move before the unemitted region, when the bytes
	// preceding CX and BX differ, or when BX reaches 0.
	CMPL CX, SI
	JBE  match_extend_back_end_encodeBlockAsm8B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeBlockAsm8B
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeBlockAsm8B
	JMP  match_extend_back_loop_encodeBlockAsm8B

match_extend_back_end_encodeBlockAsm8B:
	// Output bound check: AX + 3 + (CX - 12(SP)) must stay below the limit
	// stored at (SP); otherwise the function returns 0.
	// NOTE(review): (SP) is written before this excerpt - presumably in the
	// function prologue; confirm.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm8B:
	// Emit the pending literals src[12(SP):CX], if any.
	//   SI = DX + old 12(SP)  (source of the literal bytes)
	//   R8 = literal length, DI = length - 1
	// 12(SP) is advanced to CX before the copy.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm8B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	// Choose the header size from length-1: < 0x3c one byte, < 0x100 two
	// bytes, otherwise three bytes.
	CMPL DI, $0x3c
	JB   one_byte_match_emit_encodeBlockAsm8B
	CMPL DI, $0x00000100
	JB   two_bytes_match_emit_encodeBlockAsm8B
	// Same flags as the JB above, which was not taken, so this JB is never
	// taken either; execution falls through to the three-byte case.
	JB   three_bytes_match_emit_encodeBlockAsm8B

three_bytes_match_emit_encodeBlockAsm8B:
	// Header: byte 0xf4 followed by the 16-bit value length-1.
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBlockAsm8B

two_bytes_match_emit_encodeBlockAsm8B:
	// Header: byte 0xf0 followed by the byte length-1. The short copy
	// routine is used when length-1 < 0x40, the long one otherwise.
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	CMPL DI, $0x40
	JB   memmove_match_emit_encodeBlockAsm8B
	JMP  memmove_long_match_emit_encodeBlockAsm8B

one_byte_match_emit_encodeBlockAsm8B:
	// Header: single byte (length-1) << 2; falls through to the short copy.
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm8B:
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B

emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B

emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm8B

emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeBlockAsm8B:
	MOVQ DI, AX
	JMP  emit_literal_done_match_emit_encodeBlockAsm8B

memmove_long_match_emit_encodeBlockAsm8B:
	// Copy R8 bytes from SI to AX; DI = AX + R8 becomes the new output
	// position once the copy is done.
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	// The first and last 32 source bytes are loaded into X0-X3 now and stored
	// after the loop, so the loop itself can use 32-byte steps with aligned
	// stores without handling the unaligned head and tail.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	// R10 = R8 / 32. R11 = 64 - (AX & 31), which makes AX + R11 - 32 a
	// multiple of 32 so that the MOVOA stores below are aligned.
	MOVQ  R8, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	// DECQ leaves CF as set by the SUBQ above (which cannot borrow, since
	// R9 <= 31), so JA here is taken whenever R10 is non-zero after the
	// decrement.
	DECQ  R10
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(SI)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
	// Copies one 32-byte chunk through R9/R12 and advances R11 in step.
	// NOTE(review): JNA after DECQ repeats this block only when R10 has just
	// reached zero (or CF is set) - confirm this matches the generator's
	// intent.
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
	// Main loop: copy 32 bytes ending at offset R11, advance R11 by 32 and
	// repeat while R8 >= R11.
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  R8, R11
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	// Write the saved head and tail (these may overlap bytes already
	// written by the loop), then move AX past the copied data.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  DI, AX

emit_literal_done_match_emit_encodeBlockAsm8B:
match_nolit_loop_encodeBlockAsm8B:
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	XORL R9, R9

matchlen_loopback_16_match_nolit_encodeBlockAsm8B:
	CMPL SI, $0x10
	JB   matchlen_match8_match_nolit_encodeBlockAsm8B
	MOVQ (DI)(R9*1), R8
	MOVQ 8(DI)(R9*1), R10
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm8B
	XORQ 8(BX)(R9*1), R10
	JNZ  matchlen_bsf_16match_nolit_encodeBlockAsm8B
	LEAL -16(SI), SI
	LEAL 16(R9), R9
	JMP  matchlen_loopback_16_match_nolit_encodeBlockAsm8B

matchlen_bsf_16match_nolit_encodeBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL 8(R9)(R10*1), R9
	JMP  match_nolit_end_encodeBlockAsm8B

matchlen_match8_match_nolit_encodeBlockAsm8B:
	CMPL SI, $0x08
	JB   matchlen_match4_match_nolit_encodeBlockAsm8B
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm8B
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	JMP  matchlen_match4_match_nolit_encodeBlockAsm8B

matchlen_bsf_8_match_nolit_encodeBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP  match_nolit_end_encodeBlockAsm8B

matchlen_match4_match_nolit_encodeBlockAsm8B:
	CMPL SI, $0x04
	JB   matchlen_match2_match_nolit_encodeBlockAsm8B
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE  matchlen_match2_match_nolit_encodeBlockAsm8B
	LEAL -4(SI), SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeBlockAsm8B:
	CMPL SI, $0x01
	JE   matchlen_match1_match_nolit_encodeBlockAsm8B
	JB   match_nolit_end_encodeBlockAsm8B
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE  matchlen_match1_match_nolit_encodeBlockAsm8B
	LEAL 2(R9), R9
	SUBL $0x02, SI
	JZ   match_nolit_end_encodeBlockAsm8B

matchlen_match1_match_nolit_encodeBlockAsm8B:
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE  match_nolit_end_encodeBlockAsm8B
	LEAL 1(R9), R9

match_nolit_end_encodeBlockAsm8B:
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	CMPL R9, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBlockAsm8B
	CMPL BX, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBlockAsm8B
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R9

	// emitRepeat
	LEAL -4(R9), R9
	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
	MOVL R9, BX
	LEAL -4(R9), R9
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
	CMPL R9, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
	SHLL $0x02, R9
	ORL  $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

long_offset_short_match_nolit_encodeBlockAsm8B:
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX

	// emitRepeat
	MOVL R9, BX
	LEAL -4(R9), R9
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
	CMPL R9, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
	SHLL $0x02, R9
	ORL  $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

two_byte_offset_short_match_nolit_encodeBlockAsm8B:
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBlockAsm8B
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

emit_copy_three_match_nolit_encodeBlockAsm8B:
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm8B:
	// After a copy has been emitted: stop searching once CX reaches the
	// limit in 8(SP), otherwise load 8 bytes starting at src[CX-2] and make
	// sure the output position is still below the limit in (SP).
	// NOTE(review): 8(SP) and (SP) are initialised before this excerpt -
	// presumably in the function prologue; confirm.
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeBlockAsm8B
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm8B:
	// Index positions CX-2 and CX in the table at 24(SP) (4-byte entries).
	// Each index is ((x << 32) * 0x9e3779b1) >> 56, i.e. an 8-bit value
	// derived from the low 4 bytes of x.
	//   DI = index for the bytes at CX-2
	//   BX = index for the bytes at CX (SI shifted right by 16 bits)
	MOVQ  $0x9e3779b1, R8
	MOVQ  SI, DI
	SHRQ  $0x10, SI
	MOVQ  SI, BX
	SHLQ  $0x20, DI
	IMULQ R8, DI
	SHRQ  $0x38, DI
	SHLQ  $0x20, BX
	IMULQ R8, BX
	SHRQ  $0x38, BX
	LEAL  -2(CX), R8
	// Read the previous entry for CX's slot into BX (the new candidate)
	// before either slot is overwritten, then store CX-2 and CX.
	LEAQ  24(SP)(BX*4), R9
	MOVL  (R9), BX
	MOVL  R8, 24(SP)(DI*4)
	MOVL  CX, (R9)
	// If the 4 bytes at the candidate equal the 4 bytes at CX, encode
	// another match right away; otherwise resume the search at CX+1.
	CMPL  (DX)(BX*1), SI
	JEQ   match_nolit_loop_encodeBlockAsm8B
	INCL  CX
	JMP   search_loop_encodeBlockAsm8B

emit_remainder_encodeBlockAsm8B:
	// Emit everything from 12(SP) to src_len as one final literal run.
	// First check that AX + 3 + (src_len - 12(SP)) stays below the limit
	// stored at (SP); if not, return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm8B:
	// Nothing to emit when 12(SP) already equals src_len. Otherwise:
	//   CX = DX + old 12(SP)  (source of the literal bytes)
	//   SI = literal length, DX = length - 1
	// DX is reused as scratch from here on; the code that follows in this
	// function no longer uses it as the src base.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm8B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Header size from length-1: < 0x3c one byte, < 0x100 two bytes,
	// otherwise three bytes.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBlockAsm8B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBlockAsm8B
	// Same flags as the JB above, which was not taken, so this JB is never
	// taken either; execution falls through to the three-byte case.
	JB   three_bytes_emit_remainder_encodeBlockAsm8B

three_bytes_emit_remainder_encodeBlockAsm8B:
	// Header: byte 0xf4 followed by the 16-bit value length-1.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm8B

two_bytes_emit_remainder_encodeBlockAsm8B:
	// Header: byte 0xf0 followed by the byte length-1. The short copy
	// routine is used when length-1 < 0x40, the long one otherwise.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeBlockAsm8B

one_byte_emit_remainder_encodeBlockAsm8B:
	// Header: single byte (length-1) << 2; falls through to the short copy.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm8B:
	// Short copy of SI bytes from CX to AX. DX = AX + SI is the output
	// position after the copy; BX holds the length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Dispatch on the length. Every case loads the head and the tail of the
	// source first (the two loads may overlap) and only then stores, so a
	// whole size range is handled without a loop.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2:
	// First byte and last byte (the same byte when the length is 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3:
	// One 2-byte word plus the third byte.
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7:
	// First 4 bytes and last 4 bytes.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
	// First 8 bytes and last 8 bytes.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
	// First 16 bytes and last 16 bytes.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
	// First 32 bytes and last 32 bytes.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm8B:
	// Advance the output position past the copied bytes.
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm8B

memmove_long_emit_remainder_encodeBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm8B:
	// Return the number of bytes written: current output position (AX)
	// minus dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBetterBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Prologue. Stack frame layout as used by the code below:
//   0(SP)   8 bytes  output limit: dst_base + src_len - 6 - (src_len >> 5)
//   8(SP)   4 bytes  search limit: src_len - 8
//   12(SP)  4 bytes  start of the src bytes not yet emitted (initially 0)
//   16(SP)  4 bytes  compared with the match offset in the "Check if
//                    repeat" step further down (initially 0)
//   20(SP)  4 bytes  next search position, written by the search loop
//   24(SP)  ...      0x1200 * 128 = 589824 bytes of lookup tables, zeroed here
// Registers on exit from the prologue: AX = dst_base (output position),
// CX = 1 (current src index), DX = src_base.
TEXT ·encodeBetterBlockAsm(SB), $589848-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00001200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm:
	// Clear the tables 128 bytes per iteration, CX counting iterations.
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm
	// Initialise the scalar slots described in the header comment.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBetterBlockAsm:
	// Compute the next search position: the step grows with the distance
	// from the last emitted byte, step = 1 + ((CX - 12(SP)) >> 7), and is
	// replaced by 100 once the shifted distance exceeds 99.
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x07, BX
	CMPL BX, $0x63
	JBE  check_maxskip_ok_encodeBetterBlockAsm
	LEAL 100(CX), BX
	JMP  check_maxskip_cont_encodeBetterBlockAsm

check_maxskip_ok_encodeBetterBlockAsm:
	LEAL 1(CX)(BX*1), BX

check_maxskip_cont_encodeBetterBlockAsm:
	// Stop searching when the next position reaches the limit in 8(SP).
	// Otherwise load 8 bytes at src[CX] into SI and remember the next
	// position in 20(SP).
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeBetterBlockAsm
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	// Two table indexes are derived from SI:
	//   R9  = ((SI << 8)  * 0x00cf1bbcdcbfa563) >> 47  - 17 bits, from the
	//         low 7 bytes, into the table at 24(SP)
	//   R10 = ((SI << 32) * 0x9e3779b1) >> 50          - 14 bits, from the
	//         low 4 bytes, into the table at 524312(SP)
	MOVQ  $0x00cf1bbcdcbfa563, R8
	MOVQ  $0x9e3779b1, BX
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHLQ  $0x08, R9
	IMULQ R8, R9
	SHRQ  $0x2f, R9
	SHLQ  $0x20, R10
	IMULQ BX, R10
	SHRQ  $0x32, R10
	// Fetch both candidates (BX from the first table, DI from the second),
	// then record CX in both slots.
	MOVL  24(SP)(R9*4), BX
	MOVL  524312(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	MOVL  CX, 524312(SP)(R10*4)
	// Load 8 bytes at each candidate and first look for a full 8-byte
	// match, preferring the candidate in BX.
	MOVQ  (DX)(BX*1), R9
	MOVQ  (DX)(DI*1), R10
	CMPQ  R9, SI
	JEQ   candidate_match_encodeBetterBlockAsm
	CMPQ  R10, SI
	JNE   no_short_found_encodeBetterBlockAsm
	MOVL  DI, BX
	JMP   candidate_match_encodeBetterBlockAsm

no_short_found_encodeBetterBlockAsm:
	// No 8-byte match: accept a 4-byte match instead. A 4-byte match on the
	// second-table candidate goes through candidateS_match first.
	CMPL R9, SI
	JEQ  candidate_match_encodeBetterBlockAsm
	CMPL R10, SI
	JEQ  candidateS_match_encodeBetterBlockAsm
	// Nothing matched: continue at the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeBetterBlockAsm

candidateS_match_encodeBetterBlockAsm:
	// The second-table candidate (DI) matched 4 bytes at CX. Before using
	// it, look up position CX+1 in the first table: SI is shifted by one
	// byte, indexed with the same multiplier still held in R8, and the slot
	// is updated with CX+1. If that candidate matches 4 bytes, the match
	// starts at CX+1 with candidate BX; otherwise CX is restored and DI
	// becomes the candidate.
	SHRQ  $0x08, SI
	MOVQ  SI, R9
	SHLQ  $0x08, R9
	IMULQ R8, R9
	SHRQ  $0x2f, R9
	MOVL  24(SP)(R9*4), BX
	INCL  CX
	MOVL  CX, 24(SP)(R9*4)
	CMPL  (DX)(BX*1), SI
	JEQ   candidate_match_encodeBetterBlockAsm
	DECL  CX
	MOVL  DI, BX

candidate_match_encodeBetterBlockAsm:
	// Extend the match backwards. CX = current position, BX = candidate
	// position, SI = 12(SP) (start of bytes not yet emitted). Nothing to do
	// when the candidate already sits at index 0.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeBetterBlockAsm

match_extend_back_loop_encodeBetterBlockAsm:
	// Stop when CX would move before the unemitted region, when the bytes
	// preceding CX and BX differ, or when BX reaches 0.
	CMPL CX, SI
	JBE  match_extend_back_end_encodeBetterBlockAsm
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeBetterBlockAsm
	JMP  match_extend_back_loop_encodeBetterBlockAsm

match_extend_back_end_encodeBetterBlockAsm:
	// Output bound check: AX + 5 + (CX - 12(SP)) must stay below the limit
	// stored at (SP); otherwise the function returns 0.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 5(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm:
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	XORL R11, R11

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeBetterBlockAsm
	MOVQ (R8)(R11*1), R10
	MOVQ 8(R8)(R11*1), R12
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
	XORQ 8(R9)(R11*1), R12
	JNZ  matchlen_bsf_16match_nolit_encodeBetterBlockAsm
	LEAL -16(DI), DI
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_match_nolit_encodeBetterBlockAsm

matchlen_bsf_16match_nolit_encodeBetterBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  match_nolit_end_encodeBetterBlockAsm

matchlen_match8_match_nolit_encodeBetterBlockAsm:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeBetterBlockAsm
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP  matchlen_match4_match_nolit_encodeBetterBlockAsm

matchlen_bsf_8_match_nolit_encodeBetterBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  match_nolit_end_encodeBetterBlockAsm

matchlen_match4_match_nolit_encodeBetterBlockAsm:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeBetterBlockAsm
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE  matchlen_match2_match_nolit_encodeBetterBlockAsm
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeBetterBlockAsm:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeBetterBlockAsm
	JB   match_nolit_end_encodeBetterBlockAsm
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE  matchlen_match1_match_nolit_encodeBetterBlockAsm
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeBetterBlockAsm

matchlen_match1_match_nolit_encodeBetterBlockAsm:
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE  match_nolit_end_encodeBetterBlockAsm
	LEAL 1(R11), R11

match_nolit_end_encodeBetterBlockAsm:
	// DI = CX - BX. Both registers were advanced by 4 before matchLen, so DI
	// is the distance between the match position and its candidate (the
	// offset). R11 holds the bytes matched after the first 4.
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	// An offset equal to the one stored in 16(SP) takes the repeat path.
	CMPL 16(SP), DI
	JEQ  match_is_repeat_encodeBetterBlockAsm
	// Otherwise accept the match when R11 > 1 or when the offset is at most
	// 0xffff. A match with R11 <= 1 and a larger offset is dropped and the
	// search resumes one past the position saved in 20(SP).
	CMPL R11, $0x01
	JA   match_length_ok_encodeBetterBlockAsm
	CMPL DI, $0x0000ffff
	JBE  match_length_ok_encodeBetterBlockAsm
	MOVL 20(SP), CX
	INCL CX
	JMP  search_loop_encodeBetterBlockAsm

match_length_ok_encodeBetterBlockAsm:
	MOVL DI, 16(SP)
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB   one_byte_match_emit_encodeBetterBlockAsm
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_encodeBetterBlockAsm
	CMPL BX, $0x00010000
	JB   three_bytes_match_emit_encodeBetterBlockAsm
	CMPL BX, $0x01000000
	JB   four_bytes_match_emit_encodeBetterBlockAsm
	MOVB $0xfc, (AX)
	MOVL BX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm

four_bytes_match_emit_encodeBetterBlockAsm:
	MOVL BX, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW BX, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm

three_bytes_match_emit_encodeBetterBlockAsm:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm

two_bytes_match_emit_encodeBetterBlockAsm:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_match_emit_encodeBetterBlockAsm
	JMP  memmove_long_match_emit_encodeBetterBlockAsm

one_byte_match_emit_encodeBetterBlockAsm:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBetterBlockAsm:
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x04
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
	CMPQ R8, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
	MOVL (R9), R10
	MOVL R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm:
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm

memmove_long_match_emit_encodeBetterBlockAsm:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_encodeBetterBlockAsm:
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
	CMPL DI, $0x00010000
	JB   two_byte_offset_match_nolit_encodeBetterBlockAsm
	CMPL R11, $0x40
	JBE  four_bytes_remain_match_nolit_encodeBetterBlockAsm
	MOVB $0xff, (AX)
	MOVL DI, 1(AX)
	LEAL -64(R11), R11
	ADDQ $0x05, AX
	CMPL R11, $0x04
	JB   four_bytes_remain_match_nolit_encodeBetterBlockAsm

	// emitRepeat
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
	CMPL R11, $0x00010100
	JB   repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
	CMPL R11, $0x0100ffff
	JB   repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
	LEAL -16842747(R11), R11
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy

repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
	LEAL -65536(R11), R11
	MOVL R11, DI
	MOVW $0x001d, (AX)
	MOVW R11, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

four_bytes_remain_match_nolit_encodeBetterBlockAsm:
	TESTL R11, R11
	JZ    match_nolit_emitcopy_end_encodeBetterBlockAsm
	XORL  BX, BX
	LEAL  -1(BX)(R11*4), R11
	MOVB  R11, (AX)
	MOVL  DI, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm

two_byte_offset_match_nolit_encodeBetterBlockAsm:
	CMPL R11, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm
	CMPL DI, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm
	MOVL $0x00000001, BX
	LEAL 16(BX), BX
	MOVB DI, 1(AX)
	MOVL DI, R8
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R11

	// emitRepeat
	LEAL -4(R11), R11
	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b

emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	// BX keeps R11 as it was on entry; R11 itself is reduced by 4.
	// BX <= 8 uses the two-byte form. BX of 9..11 with DI < 2048 uses the
	// two-byte form that also carries DI. Everything else falls into the
	// size selection below.
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	// Select the output size from R11 (already reduced by 4):
	// below 0x104 three bytes, below 0x10100 four, below 0x100ffff five.
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
	CMPL R11, $0x00010100
	JB   repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
	CMPL R11, $0x0100ffff
	JB   repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
	// Otherwise write the five bytes 1d 00 fb ff ff, subtract 16842747
	// (0x0100fffb) from R11 and loop to encode what is left.
	LEAL -16842747(R11), R11
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b

repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	// Five output bytes: the 16-bit constant 0x001d, then the low 24 bits of
	// R11-65536 (16 bits from R11, the third byte via DI, which is
	// overwritten).
	LEAL -65536(R11), R11
	MOVL R11, DI
	MOVW $0x001d, (AX)
	MOVW R11, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	// Four output bytes: the 16-bit constant 0x0019, then the low 16 bits
	// of R11-256.
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	// Three output bytes: the 16-bit constant 0x0015, then the low byte of
	// R11-4.
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	// Two output bytes: the low 16 bits of (R11<<2)|1.
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	// Two output bytes: byte 1 is the low byte of DI, byte 0 is the low byte
	// of (4*R11+1) | ((DI>>8)<<5). BX is zeroed only to serve as the LEAL
	// base; DI is overwritten.
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

long_offset_short_match_nolit_encodeBetterBlockAsm:
	// Three output bytes: 0xee, then the low 16 bits of DI. R11 is reduced
	// by 60 and the rest is handed to the repeat encoder below.
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX

	// emitRepeat
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	// BX keeps R11 as it was on entry; R11 itself is reduced by 4.
	// BX <= 8 uses the two-byte form. BX of 9..11 with DI < 2048 uses the
	// two-byte form that also carries DI. Everything else falls into the
	// size selection below.
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	// Select the output size from R11 (already reduced by 4):
	// below 0x104 three bytes, below 0x10100 four, below 0x100ffff five.
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
	CMPL R11, $0x00010100
	JB   repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
	CMPL R11, $0x0100ffff
	JB   repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
	// Otherwise write the five bytes 1d 00 fb ff ff, subtract 16842747
	// (0x0100fffb) from R11 and loop to encode what is left.
	LEAL -16842747(R11), R11
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short

repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	// Five output bytes: the 16-bit constant 0x001d, then the low 24 bits of
	// R11-65536 (16 bits from R11, the third byte via DI, which is
	// overwritten).
	LEAL -65536(R11), R11
	MOVL R11, DI
	MOVW $0x001d, (AX)
	MOVW R11, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	// Four output bytes: the 16-bit constant 0x0019, then the low 16 bits
	// of R11-256.
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	// Three output bytes: the 16-bit constant 0x0015, then the low byte of
	// R11-4.
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	// Two output bytes: the low 16 bits of (R11<<2)|1.
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	// Two output bytes: byte 1 is the low byte of DI, byte 0 is the low byte
	// of (4*R11+1) | ((DI>>8)<<5). BX is zeroed only to serve as the LEAL
	// base; DI is overwritten.
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
	// BX = 4*R11. The two-byte form below is used only when R11 < 12 and
	// DI < 2048; otherwise the three-byte form is used.
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm
	CMPL DI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm
	// Two output bytes: byte 1 is the low byte of DI, byte 0 is the low byte
	// of (4*R11-15) | ((DI>>8)<<5). DI is overwritten.
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

emit_copy_three_match_nolit_encodeBetterBlockAsm:
	// Three output bytes: the low byte of 4*R11-2, then the low 16 bits of DI.
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

match_is_repeat_encodeBetterBlockAsm:
	// BX = 12(SP) is the source offset where the pending literal bytes
	// start. When it equals SI there are none and the copy is skipped.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
	// R8 = number of literal bytes (SI-BX), R9 = DX+BX (address of the first
	// one), 12(SP) = SI. BX becomes count-1 and selects the header size.
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB   one_byte_match_emit_repeat_encodeBetterBlockAsm
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_repeat_encodeBetterBlockAsm
	CMPL BX, $0x00010000
	JB   three_bytes_match_emit_repeat_encodeBetterBlockAsm
	CMPL BX, $0x01000000
	JB   four_bytes_match_emit_repeat_encodeBetterBlockAsm
	// Five header bytes: 0xfc, then count-1 as 32 bits.
	MOVB $0xfc, (AX)
	MOVL BX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm

four_bytes_match_emit_repeat_encodeBetterBlockAsm:
	// Four header bytes: 0xf8, then count-1 as 24 bits (16 bits from BX, the
	// third byte via R10).
	MOVL BX, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW BX, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm

three_bytes_match_emit_repeat_encodeBetterBlockAsm:
	// Three header bytes: 0xf4, then count-1 as 16 bits.
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm

two_bytes_match_emit_repeat_encodeBetterBlockAsm:
	// Two header bytes: 0xf0, then count-1 as one byte. count-1 below 64
	// uses the short copy, anything larger the long copy.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_match_emit_repeat_encodeBetterBlockAsm
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm

one_byte_match_emit_repeat_encodeBetterBlockAsm:
	// One header byte: (count-1)<<2. Falls through into the short copy.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm:
	// BX = output address after the copy (AX + R8).
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	// Dispatch on the byte count R8. Each case loads everything it needs
	// (head and tail of the range, which may overlap) before storing.
	CMPQ R8, $0x04
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
	CMPQ R8, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
	// Always moves four bytes, also when R8 is below 4; the output pointer
	// still advances by R8 only (AX = BX at memmove_end_copy).
	MOVL (R9), R10
	MOVL R10, (AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
	// First four and last four bytes; R9 is reused for the tail value.
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
	// First eight and last eight bytes; R9 is reused for the tail value.
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
	// First sixteen and last sixteen bytes, unaligned.
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
	// First 32 and last 32 bytes, unaligned.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
	// Advance the output pointer past the copied bytes.
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm

memmove_long_match_emit_repeat_encodeBetterBlockAsm:
	// BX = output address after the copy (AX + R8).
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// The first and last 32 source bytes are loaded into X0-X3 up front and
	// stored with unaligned moves at the very end; the loops in between use
	// aligned stores (MOVOA).
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R12 = R8/32. R13 = 64 - (AX & 31), which makes AX+R13-32 a multiple
	// of 32.
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	// DECQ does not modify CF, so JA here (and JNA at the end of
	// big_loop_back) combines ZF from DECQ with CF left by the preceding
	// SUBQ (respectively ADDQ).
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
	// Moves 32 bytes from R10 to the aligned address R14 and advances R10,
	// R14 and R13 by 32.
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
	// Moves 32 bytes at offset R13-32, then R13 += 32; repeats while
	// R8 >= R13.
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
	// Store the saved head and tail, then advance the output pointer.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
	// Advance CX by R11 and record the new position in 12(SP); R11 is then
	// increased by 4 for the encoder below.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitRepeat
emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
	// BX keeps R11 as it was on entry; R11 itself is reduced by 4.
	// BX <= 8 uses the two-byte form. BX of 9..11 with DI < 2048 uses the
	// two-byte form that also carries DI. Everything else falls into the
	// size selection below.
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
	// Select the output size from R11 (already reduced by 4):
	// below 0x104 three bytes, below 0x10100 four, below 0x100ffff five.
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_repeat_encodeBetterBlockAsm
	CMPL R11, $0x00010100
	JB   repeat_four_match_nolit_repeat_encodeBetterBlockAsm
	CMPL R11, $0x0100ffff
	JB   repeat_five_match_nolit_repeat_encodeBetterBlockAsm
	// Otherwise write the five bytes 1d 00 fb ff ff, subtract 16842747
	// (0x0100fffb) from R11 and loop to encode what is left.
	LEAL -16842747(R11), R11
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm

repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
	// Five output bytes: the 16-bit constant 0x001d, then the low 24 bits of
	// R11-65536 (16 bits from R11, the third byte via DI, which is
	// overwritten).
	LEAL -65536(R11), R11
	MOVL R11, DI
	MOVW $0x001d, (AX)
	MOVW R11, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
	// Four output bytes: the 16-bit constant 0x0019, then the low 16 bits
	// of R11-256.
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
	// Three output bytes: the 16-bit constant 0x0015, then the low byte of
	// R11-4.
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
	// Two output bytes: the low 16 bits of (R11<<2)|1.
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
	// Two output bytes: byte 1 is the low byte of DI, byte 0 is the low byte
	// of (4*R11+1) | ((DI>>8)<<5). BX is zeroed only to serve as the LEAL
	// base; DI is overwritten. Falls through to match_nolit_emitcopy_end.
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX

match_nolit_emitcopy_end_encodeBetterBlockAsm:
	// Once the position CX reaches the limit in 8(SP), only the remainder
	// is left to emit.
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeBetterBlockAsm
	// Return 0 when the output pointer AX has reached the limit stored at
	// (SP).
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm:
	// BX and DI hold the two hash multipliers. SI is advanced by one and
	// R8 = CX-2; eight bytes are loaded at DX+SI, DX+SI+1, DX+R8 and
	// DX+R8+1.
	MOVQ  $0x00cf1bbcdcbfa563, BX
	MOVQ  $0x9e3779b1, DI
	LEAQ  1(SI), SI
	LEAQ  -2(CX), R8
	MOVQ  (DX)(SI*1), R9
	MOVQ  1(DX)(SI*1), R10
	MOVQ  (DX)(R8*1), R11
	MOVQ  1(DX)(R8*1), R12
	// R9 and R11: ((v << 8) * BX) >> 47, a 17-bit index into the table at
	// 24(SP). R10 and R12: ((v << 32) * DI) >> 50, a 14-bit index into the
	// table at 524312(SP).
	SHLQ  $0x08, R9
	IMULQ BX, R9
	SHRQ  $0x2f, R9
	SHLQ  $0x20, R10
	IMULQ DI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x08, R11
	IMULQ BX, R11
	SHRQ  $0x2f, R11
	SHLQ  $0x20, R12
	IMULQ DI, R12
	SHRQ  $0x32, R12
	// Record SI and R8 in the first table and SI+1 and R8+1 in the second
	// (DI is reused for SI+1 from here on).
	LEAQ  1(SI), DI
	LEAQ  1(R8), R13
	MOVL  SI, 24(SP)(R9*4)
	MOVL  R8, 24(SP)(R11*4)
	MOVL  DI, 524312(SP)(R10*4)
	MOVL  R13, 524312(SP)(R12*4)
	// DI = (SI + R8 + 1) / 2; then SI += 1 and R8 -= 1 for index_loop.
	LEAQ  1(R8)(SI*1), DI
	SHRQ  $0x01, DI
	ADDQ  $0x01, SI
	SUBQ  $0x01, R8

index_loop_encodeBetterBlockAsm:
	// While DI < R8: hash the eight bytes at DX+SI and DX+DI with
	// ((v << 8) * BX) >> 47, store SI and DI in the table at 24(SP), and
	// advance both by 2. Then return to the search loop.
	CMPQ  DI, R8
	JAE   search_loop_encodeBetterBlockAsm
	MOVQ  (DX)(SI*1), R9
	MOVQ  (DX)(DI*1), R10
	SHLQ  $0x08, R9
	IMULQ BX, R9
	SHRQ  $0x2f, R9
	SHLQ  $0x08, R10
	IMULQ BX, R10
	SHRQ  $0x2f, R10
	MOVL  SI, 24(SP)(R9*4)
	MOVL  DI, 24(SP)(R10*4)
	ADDQ  $0x02, SI
	ADDQ  $0x02, DI
	JMP   index_loop_encodeBetterBlockAsm

emit_remainder_encodeBetterBlockAsm:
	// CX = AX + (src_len - 12(SP)) + 5. Return 0 when that reaches the
	// output limit stored at (SP).
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm:
	// BX = 12(SP) is where the remaining literal bytes start. When it
	// equals src_len there is nothing left to write.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm
	// SI = number of literal bytes (src_len-BX), 12(SP) = src_len,
	// CX = DX+BX (address of the first one). DX is reused from here on:
	// it first holds count-1, which selects the header size.
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBetterBlockAsm
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBetterBlockAsm
	CMPL DX, $0x00010000
	JB   three_bytes_emit_remainder_encodeBetterBlockAsm
	CMPL DX, $0x01000000
	JB   four_bytes_emit_remainder_encodeBetterBlockAsm
	// Five header bytes: 0xfc, then count-1 as 32 bits.
	MOVB $0xfc, (AX)
	MOVL DX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm

four_bytes_emit_remainder_encodeBetterBlockAsm:
	// Four header bytes: 0xf8, then count-1 as 24 bits (16 bits from DX, the
	// third byte via BX).
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm

three_bytes_emit_remainder_encodeBetterBlockAsm:
	// Three header bytes: 0xf4, then count-1 as 16 bits.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm

two_bytes_emit_remainder_encodeBetterBlockAsm:
	// Two header bytes: 0xf0, then count-1 as one byte. count-1 below 64
	// uses the short copy, anything larger the long copy.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBetterBlockAsm
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm

one_byte_emit_remainder_encodeBetterBlockAsm:
	// One header byte: (count-1)<<2. Falls through into the short copy.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm:
	// DX = output address after the copy (AX + SI); BX = byte count.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Dispatch on the byte count BX. JB and JE after the first CMPQ test
	// the same flags (below 3, exactly 3). Each case loads everything it
	// needs before storing, reusing SI and CX for the loaded values.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2:
	// First byte and last byte (the same byte when BX is 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3:
	// Two bytes, then the third.
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
	// First four and last four bytes.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
	// First eight and last eight bytes.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
	// First sixteen and last sixteen bytes, unaligned.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
	// First 32 and last 32 bytes, unaligned.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
	// Advance the output pointer past the copied bytes.
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm

memmove_long_emit_remainder_encodeBetterBlockAsm:
	// DX = output address after the copy (AX + SI); BX = byte count.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// The first and last 32 source bytes are loaded into X0-X3 up front and
	// stored with unaligned moves at the very end; the loops in between use
	// aligned stores (MOVOA).
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = BX/32. R8 = 64 - (AX & 31), which makes AX+R8-32 a multiple
	// of 32.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// DECQ does not modify CF, so JA here (and JNA at the end of
	// big_loop_back) combines ZF from DECQ with CF left by the preceding
	// SUBQ (respectively ADDQ).
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
	// Moves 32 bytes from SI to the aligned address R9 and advances SI, R9
	// and R8 by 32.
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
	// Moves 32 bytes at offset R8-32, then R8 += 32; repeats while
	// BX >= R8.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
	// Store the saved head and tail, then advance the output pointer.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Stack slots as used by the code below:
//   (SP)        output address limit; reaching it makes the function return 0
//   8(SP)       src_len - 8, the limit for the search position
//   12(SP)      source offset of the first byte not yet written as a literal
//   16(SP)      offset of the previous match, compared at "Check if repeat"
//   20(SP)      search position to resume from when a candidate is rejected
//   24(SP)      first lookup table, 4-byte entries indexed by a 17-bit hash
//   524312(SP)  second lookup table, 4-byte entries indexed by a 14-bit hash
TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56
	// AX = output pointer. CX counts 0x1200 iterations of the 128-byte
	// zeroing loop below, i.e. 589824 bytes starting at 24(SP), which is
	// the rest of the 589848-byte frame.
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00001200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm4MB:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm4MB
	// 12(SP) = 0, 8(SP) = src_len - 8, and
	// (SP) = dst_base + src_len - 6 - (src_len >> 5), the subtraction being
	// done in 32 bits.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// The search starts at position CX = 1 with 16(SP) = 0; DX = src_base
	// from here on.
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBetterBlockAsm4MB:
	// BX = (CX - 12(SP)) >> 7. The next position is CX + BX + 1 when
	// BX <= 99 and CX + 100 otherwise.
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x07, BX
	CMPL BX, $0x63
	JBE  check_maxskip_ok_encodeBetterBlockAsm4MB
	LEAL 100(CX), BX
	JMP  check_maxskip_cont_encodeBetterBlockAsm4MB

check_maxskip_ok_encodeBetterBlockAsm4MB:
	LEAL 1(CX)(BX*1), BX

check_maxskip_cont_encodeBetterBlockAsm4MB:
	// Stop searching once the next position reaches the limit in 8(SP).
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeBetterBlockAsm4MB
	// SI = eight source bytes at CX; 20(SP) = next position.
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	// R9 = ((SI << 8) * R8) >> 47 indexes the table at 24(SP);
	// R10 = ((SI << 32) * BX) >> 50 indexes the table at 524312(SP).
	MOVQ  $0x00cf1bbcdcbfa563, R8
	MOVQ  $0x9e3779b1, BX
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHLQ  $0x08, R9
	IMULQ R8, R9
	SHRQ  $0x2f, R9
	SHLQ  $0x20, R10
	IMULQ BX, R10
	SHRQ  $0x32, R10
	// BX and DI receive the previous table entries; both entries are then
	// replaced by CX.
	MOVL  24(SP)(R9*4), BX
	MOVL  524312(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	MOVL  CX, 524312(SP)(R10*4)
	// R9 and R10 = eight source bytes at the two candidates. A full
	// eight-byte match with SI is accepted immediately, the candidate from
	// the first table (BX) being checked first.
	MOVQ  (DX)(BX*1), R9
	MOVQ  (DX)(DI*1), R10
	CMPQ  R9, SI
	JEQ   candidate_match_encodeBetterBlockAsm4MB
	CMPQ  R10, SI
	JNE   no_short_found_encodeBetterBlockAsm4MB
	MOVL  DI, BX
	JMP   candidate_match_encodeBetterBlockAsm4MB

no_short_found_encodeBetterBlockAsm4MB:
	// Compare only the low four bytes (CMPL). With no match at either
	// candidate, continue the search at the position saved in 20(SP).
	CMPL R9, SI
	JEQ  candidate_match_encodeBetterBlockAsm4MB
	CMPL R10, SI
	JEQ  candidateS_match_encodeBetterBlockAsm4MB
	MOVL 20(SP), CX
	JMP  search_loop_encodeBetterBlockAsm4MB

candidateS_match_encodeBetterBlockAsm4MB:
	// Drop the lowest byte of SI (leaving the source bytes from CX+1 on),
	// hash it with ((v << 8) * R8) >> 47 and look up the table at 24(SP).
	// The entry is replaced by CX+1.
	SHRQ  $0x08, SI
	MOVQ  SI, R9
	SHLQ  $0x08, R9
	IMULQ R8, R9
	SHRQ  $0x2f, R9
	MOVL  24(SP)(R9*4), BX
	INCL  CX
	MOVL  CX, 24(SP)(R9*4)
	// If four bytes at that entry equal the low four bytes of SI, continue
	// with it at CX+1; otherwise restore CX and use the candidate in DI.
	CMPL  (DX)(BX*1), SI
	JEQ   candidate_match_encodeBetterBlockAsm4MB
	DECL  CX
	MOVL  DI, BX

candidate_match_encodeBetterBlockAsm4MB:
	// SI = 12(SP). No backward extension when the candidate BX is 0.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeBetterBlockAsm4MB

match_extend_back_loop_encodeBetterBlockAsm4MB:
	// Step CX and BX back by one while CX > SI and the bytes just before
	// both positions are equal; also stop when BX reaches 0.
	CMPL CX, SI
	JBE  match_extend_back_end_encodeBetterBlockAsm4MB
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm4MB
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeBetterBlockAsm4MB
	JMP  match_extend_back_loop_encodeBetterBlockAsm4MB

match_extend_back_end_encodeBetterBlockAsm4MB:
	// SI = AX + (CX - 12(SP)) + 4. Return 0 when that reaches the output
	// limit stored at (SP).
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 4(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm4MB:
	// SI keeps the position before the adjustment; CX and BX are both
	// advanced by 4. DI = src_len - CX, R8 = DX+CX and R9 = DX+BX are the
	// inputs of the length comparison that follows.
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	// R11 counts the bytes that are equal at R8 and R9. DI is the number of
	// bytes still available for comparison and shrinks as R11 grows.
	XORL R11, R11

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB:
	// Sixteen bytes per iteration, as two eight-byte XORs; a non-zero XOR
	// ends the loop at the corresponding bsf label.
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeBetterBlockAsm4MB
	MOVQ (R8)(R11*1), R10
	MOVQ 8(R8)(R11*1), R12
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
	XORQ 8(R9)(R11*1), R12
	JNZ  matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB
	LEAL -16(DI), DI
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB

matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB:
	// R12 is non-zero here. The index of its lowest set bit divided by 8 is
	// the number of equal bytes in the second eight-byte word; the first
	// word matched, hence the extra 8.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  match_nolit_end_encodeBetterBlockAsm4MB

matchlen_match8_match_nolit_encodeBetterBlockAsm4MB:
	// One eight-byte comparison when at least eight bytes remain.
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP  matchlen_match4_match_nolit_encodeBetterBlockAsm4MB

matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB:
	// R10 is non-zero here. The index of its lowest set bit divided by 8 is
	// the number of equal bytes in this word.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  match_nolit_end_encodeBetterBlockAsm4MB

matchlen_match4_match_nolit_encodeBetterBlockAsm4MB:
	// One four-byte comparison when at least four bytes remain; on a
	// mismatch the two-byte and one-byte checks below still run.
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE  matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeBetterBlockAsm4MB:
	// DI == 1 goes to the single-byte check, DI == 0 is done (JE and JB
	// test the flags of the same CMPL). Otherwise compare two bytes.
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
	JB   match_nolit_end_encodeBetterBlockAsm4MB
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE  matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeBetterBlockAsm4MB

matchlen_match1_match_nolit_encodeBetterBlockAsm4MB:
	// Final single-byte comparison.
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE  match_nolit_end_encodeBetterBlockAsm4MB
	LEAL 1(R11), R11

match_nolit_end_encodeBetterBlockAsm4MB:
	// DI = CX - BX, the distance between the position and the candidate.
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	// A distance equal to 16(SP) takes the repeat path. Otherwise the match
	// is kept when R11 > 1 or DI <= 65535; if neither holds it is dropped
	// and the search resumes at 20(SP) + 1.
	CMPL 16(SP), DI
	JEQ  match_is_repeat_encodeBetterBlockAsm4MB
	CMPL R11, $0x01
	JA   match_length_ok_encodeBetterBlockAsm4MB
	CMPL DI, $0x0000ffff
	JBE  match_length_ok_encodeBetterBlockAsm4MB
	MOVL 20(SP), CX
	INCL CX
	JMP  search_loop_encodeBetterBlockAsm4MB

match_length_ok_encodeBetterBlockAsm4MB:
	// Remember the distance in 16(SP). BX = 12(SP) is where the pending
	// literal bytes start; when it equals SI there are none.
	MOVL DI, 16(SP)
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm4MB
	// R8 = number of literal bytes (SI-BX), R9 = DX+BX (address of the first
	// one), 12(SP) = SI. BX becomes count-1 and selects the header size.
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB   one_byte_match_emit_encodeBetterBlockAsm4MB
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_encodeBetterBlockAsm4MB
	CMPL BX, $0x00010000
	JB   three_bytes_match_emit_encodeBetterBlockAsm4MB
	// Four header bytes: 0xf8, then count-1 as 24 bits (16 bits from BX, the
	// third byte via R10). This is the largest header in this function.
	MOVL BX, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW BX, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB

three_bytes_match_emit_encodeBetterBlockAsm4MB:
	// Three header bytes: 0xf4, then count-1 as 16 bits.
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB

two_bytes_match_emit_encodeBetterBlockAsm4MB:
	// Two header bytes: 0xf0, then count-1 as one byte. count-1 below 64
	// uses the short copy, anything larger the long copy.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_match_emit_encodeBetterBlockAsm4MB
	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB

one_byte_match_emit_encodeBetterBlockAsm4MB:
	// One header byte: (count-1)<<2. Falls through into the short copy.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBetterBlockAsm4MB:
	// BX = output address after the copy (AX + R8).
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	// Dispatch on the byte count R8. Each case loads everything it needs
	// (head and tail of the range, which may overlap) before storing.
	CMPQ R8, $0x04
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
	CMPQ R8, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
	// Always moves four bytes, also when R8 is below 4; the output pointer
	// still advances by R8 only (AX = BX at memmove_end_copy).
	MOVL (R9), R10
	MOVL R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
	// First four and last four bytes; R9 is reused for the tail value.
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
	// First eight and last eight bytes; R9 is reused for the tail value.
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
	// First sixteen and last sixteen bytes, unaligned.
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
	// First 32 and last 32 bytes, unaligned.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
	// Advance the output pointer past the copied bytes.
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm4MB

memmove_long_match_emit_encodeBetterBlockAsm4MB:
	// BX = output address after the copy (AX + R8).
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// The first and last 32 source bytes are loaded into X0-X3 up front and
	// stored with unaligned moves at the very end; the loops in between use
	// aligned stores (MOVOA).
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R12 = R8/32. R13 = 64 - (AX & 31), which makes AX+R13-32 a multiple
	// of 32.
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	// DECQ does not modify CF, so JA here (and JNA at the end of
	// big_loop_back) combines ZF from DECQ with CF left by the preceding
	// SUBQ (respectively ADDQ).
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
	// Moves 32 bytes from R10 to the aligned address R14 and advances R10,
	// R14 and R13 by 32.
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	// Moves 32 bytes at offset R13-32, then R13 += 32; repeats while
	// R8 >= R13.
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	// Store the saved head and tail, then advance the output pointer.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
	// Advance CX by R11 and record the new position in 12(SP); R11 is then
	// increased by 4 for the encoder below.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
	// DI < 65536 uses the forms with a 16-bit (or shorter) DI field.
	// Otherwise R11 <= 64 is written as one five-byte item; for larger R11
	// the five bytes 0xff + 32-bit DI are written, R11 is reduced by 64 and
	// the rest is encoded below (as another five-byte item when under 4).
	CMPL DI, $0x00010000
	JB   two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
	CMPL R11, $0x40
	JBE  four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
	MOVB $0xff, (AX)
	MOVL DI, 1(AX)
	LEAL -64(R11), R11
	ADDQ $0x05, AX
	CMPL R11, $0x04
	JB   four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB

	// emitRepeat
	// BX keeps R11 as it was on entry; R11 itself is reduced by 4.
	// BX <= 8 uses the two-byte form. BX of 9..11 with DI < 2048 uses the
	// two-byte form that also carries DI. Everything else falls into the
	// size selection below.
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	// Select the output size from R11 (already reduced by 4): below 0x104
	// three bytes, below 0x10100 four, otherwise the five-byte form that
	// follows.
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL R11, $0x00010100
	JB   repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	// Five output bytes: the 16-bit constant 0x001d, then the low 24 bits of
	// R11-65536 (16 bits from R11, the third byte via DI, which is
	// overwritten).
	LEAL -65536(R11), R11
	MOVL R11, DI
	MOVW $0x001d, (AX)
	MOVW R11, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	// Four output bytes: the 16-bit constant 0x0019, then the low 16 bits
	// of R11-256.
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	// Three output bytes: the 16-bit constant 0x0015, then the low byte of
	// R11-4.
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	// Two output bytes: the low 16 bits of (R11<<2)|1.
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	// Two output bytes: byte 1 is the low byte of DI, byte 0 is the low byte
	// of (4*R11+1) | ((DI>>8)<<5). BX is zeroed only to serve as the LEAL
	// base; DI is overwritten.
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
	// Nothing is written when R11 is zero.
	TESTL R11, R11
	JZ    match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
	// Five output bytes: the low byte of 4*R11-1 (BX is zeroed to act as the
	// LEAL base), then DI as a 32-bit value.
	XORL  BX, BX
	LEAL  -1(BX)(R11*4), R11
	MOVB  R11, (AX)
	MOVL  DI, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
	// R11 <= 64 takes the "short" path; otherwise DI >= 2048 goes to
	// long_offset_short.
	CMPL R11, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
	CMPL DI, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm4MB
	// Two output bytes: byte 1 is the low byte of DI, byte 0 is
	// 17 | ((DI>>8)<<5). DI is overwritten; the size selection that follows
	// only uses DI as scratch. R11 is then reduced by 8.
	MOVL $0x00000001, BX
	LEAL 16(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R11

	// emitRepeat
	// The two-byte forms are skipped: R11 is reduced by 4 and control goes
	// straight to the size selection.
	LEAL -4(R11), R11
	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	// Select the output size from R11 (already reduced by 4): below 0x104
	// three bytes, below 0x10100 four, otherwise the five-byte form that
	// follows.
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
	CMPL R11, $0x00010100
	JB   repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
	// Five output bytes: the 16-bit constant 0x001d, then the low 24 bits of
	// R11-65536 (16 bits from R11, the third byte via DI, which is
	// overwritten).
	LEAL -65536(R11), R11
	MOVL R11, DI
	MOVW $0x001d, (AX)
	MOVW R11, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	// Four output bytes: the 16-bit constant 0x0019, then the low 16 bits
	// of R11-256.
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	// Three output bytes: the 16-bit constant 0x0015, then the low byte of
	// R11-4.
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	// Two output bytes: the low 16 bits of (R11<<2)|1.
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	// Two output bytes: byte 1 is the low byte of DI, byte 0 is the low byte
	// of (4*R11+1) | ((DI>>8)<<5). BX is zeroed only to serve as the LEAL
	// base; DI is overwritten.
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

long_offset_short_match_nolit_encodeBetterBlockAsm4MB:
	// Three output bytes: 0xee, then the low 16 bits of DI. R11 is reduced
	// by 60 and the rest is handed to the repeat encoder below.
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX

	// emitRepeat
	// BX keeps R11 as it was on entry; R11 itself is reduced by 4.
	// BX <= 8 uses the two-byte form. BX of 9..11 with DI < 2048 uses the
	// two-byte form that also carries DI. Everything else falls into the
	// size selection below.
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	// Select the output size from R11 (already reduced by 4): below 0x104
	// three bytes, below 0x10100 four, otherwise the five-byte form that
	// follows.
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL R11, $0x00010100
	JB   repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	// Five output bytes: the 16-bit constant 0x001d, then the low 24 bits of
	// R11-65536 (16 bits from R11, the third byte via DI, which is
	// overwritten).
	LEAL -65536(R11), R11
	MOVL R11, DI
	MOVW $0x001d, (AX)
	MOVW R11, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	// Four output bytes: the 16-bit constant 0x0019, then the low 16 bits
	// of R11-256.
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	// Three output bytes: the 16-bit constant 0x0015, then the low byte of
	// R11-4.
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	// Two output bytes: the low 16 bits of (R11<<2)|1.
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	// Two output bytes: byte 1 is the low byte of DI, byte 0 is the low byte
	// of (4*R11+1) | ((DI>>8)<<5). BX is zeroed only to serve as the LEAL
	// base; DI is overwritten.
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
	// BX = 4*R11. The two-byte form below is used only when R11 < 12 and
	// DI < 2048; otherwise the three-byte form is used.
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
	CMPL DI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
	// Two output bytes: byte 1 is the low byte of DI, byte 0 is the low byte
	// of (4*R11-15) | ((DI>>8)<<5). DI is overwritten.
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
	// Three output bytes: the low byte of 4*R11-2, then the low 16 bits of DI.
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

match_is_repeat_encodeBetterBlockAsm4MB:
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB   one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
	CMPL BX, $0x00010000
	JB   three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
	MOVL BX, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW BX, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_match_emit_repeat_encodeBetterBlockAsm4MB
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x04
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
	CMPQ R8, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
	MOVL (R9), R10
	MOVL R10, (AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB

memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitRepeat
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL R11, $0x00010100
	JB   repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
	LEAL -65536(R11), R11
	MOVL R11, DI
	MOVW $0x001d, (AX)
	MOVW R11, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX

match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeBetterBlockAsm4MB
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm4MB:
	MOVQ  $0x00cf1bbcdcbfa563, BX
	MOVQ  $0x9e3779b1, DI
	LEAQ  1(SI), SI
	LEAQ  -2(CX), R8
	MOVQ  (DX)(SI*1), R9
	MOVQ  1(DX)(SI*1), R10
	MOVQ  (DX)(R8*1), R11
	MOVQ  1(DX)(R8*1), R12
	SHLQ  $0x08, R9
	IMULQ BX, R9
	SHRQ  $0x2f, R9
	SHLQ  $0x20, R10
	IMULQ DI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x08, R11
	IMULQ BX, R11
	SHRQ  $0x2f, R11
	SHLQ  $0x20, R12
	IMULQ DI, R12
	SHRQ  $0x32, R12
	LEAQ  1(SI), DI
	LEAQ  1(R8), R13
	MOVL  SI, 24(SP)(R9*4)
	MOVL  R8, 24(SP)(R11*4)
	MOVL  DI, 524312(SP)(R10*4)
	MOVL  R13, 524312(SP)(R12*4)
	LEAQ  1(R8)(SI*1), DI
	SHRQ  $0x01, DI
	ADDQ  $0x01, SI
	SUBQ  $0x01, R8

index_loop_encodeBetterBlockAsm4MB:
	CMPQ  DI, R8
	JAE   search_loop_encodeBetterBlockAsm4MB
	MOVQ  (DX)(SI*1), R9
	MOVQ  (DX)(DI*1), R10
	SHLQ  $0x08, R9
	IMULQ BX, R9
	SHRQ  $0x2f, R9
	SHLQ  $0x08, R10
	IMULQ BX, R10
	SHRQ  $0x2f, R10
	MOVL  SI, 24(SP)(R9*4)
	MOVL  DI, 24(SP)(R10*4)
	ADDQ  $0x02, SI
	ADDQ  $0x02, DI
	JMP   index_loop_encodeBetterBlockAsm4MB

emit_remainder_encodeBetterBlockAsm4MB:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 4(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm4MB:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBetterBlockAsm4MB
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBetterBlockAsm4MB
	CMPL DX, $0x00010000
	JB   three_bytes_emit_remainder_encodeBetterBlockAsm4MB
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB

three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB

two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBetterBlockAsm4MB
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB

one_byte_emit_remainder_encodeBetterBlockAsm4MB:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm4MB:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB

memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBetterBlockAsm12B(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst and returns the number of bytes written to dst.
// Returns 0 when the output would reach the destination limit kept in 0(SP).
//
// Stack frame (81944 bytes):
//   0(SP)      8 bytes   destination limit: dst_base + len(src) - 6 - len(src)>>5
//   8(SP)      4 bytes   search limit: len(src) - 8
//   12(SP)     4 bytes   start of the input that has not been emitted yet
//   16(SP)     4 bytes   offset of the previous match (used to detect repeats)
//   20(SP)     4 bytes   next position to probe if the current one misses
//   24(SP)     65536 B   16384 x uint32 positions, indexed by a 14-bit hash of 6 bytes
//   65560(SP)  16384 B   4096 x uint32 positions, indexed by a 12-bit hash of 4 bytes
//
// Long-lived registers: AX = dst write pointer, CX = current src position,
// DX = src base pointer (reused as scratch once emit_remainder is reached).
TEXT ·encodeBetterBlockAsm12B(SB), $81944-56
	MOVQ dst_base+0(FP), AX
	// 0x280 (640) iterations of 128 bytes clear the 81920 bytes of both tables.
	MOVQ $0x00000280, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm12B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm12B
	// Initialise the scalar slots: nothing emitted yet, limits derived from
	// len(src), previous offset 0, and start scanning at position 1.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBetterBlockAsm12B:
	// Next probe position = CX + 1 + (CX - 12(SP))>>6, so the stride grows
	// with the amount of input scanned since the last emit. Stop searching
	// once that position reaches the search limit.
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x06, BX
	LEAL  1(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeBetterBlockAsm12B
	// SI = 8 bytes at the current position. R9 = 14-bit hash of its low
	// 6 bytes, R10 = 12-bit hash of its low 4 bytes.
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  $0x9e3779b1, BX
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x32, R9
	SHLQ  $0x20, R10
	IMULQ BX, R10
	SHRQ  $0x34, R10
	// Fetch the previous position from each table (BX from the 6-byte
	// table, DI from the 4-byte table) and record the current position.
	MOVL  24(SP)(R9*4), BX
	MOVL  65560(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	MOVL  CX, 65560(SP)(R10*4)
	// Prefer a candidate whose full 8 bytes equal the current 8 bytes.
	MOVQ  (DX)(BX*1), R9
	MOVQ  (DX)(DI*1), R10
	CMPQ  R9, SI
	JEQ   candidate_match_encodeBetterBlockAsm12B
	CMPQ  R10, SI
	JNE   no_short_found_encodeBetterBlockAsm12B
	MOVL  DI, BX
	JMP   candidate_match_encodeBetterBlockAsm12B

no_short_found_encodeBetterBlockAsm12B:
	// No 8-byte match: accept a 4-byte match from either candidate,
	// otherwise move on to the position saved in 20(SP).
	CMPL R9, SI
	JEQ  candidate_match_encodeBetterBlockAsm12B
	CMPL R10, SI
	JEQ  candidateS_match_encodeBetterBlockAsm12B
	MOVL 20(SP), CX
	JMP  search_loop_encodeBetterBlockAsm12B

candidateS_match_encodeBetterBlockAsm12B:
	// Only the 4-byte-table candidate matched. Look up position CX+1 in the
	// 6-byte table (SI>>8 holds the bytes starting at CX+1) and use that
	// candidate if its 4 bytes match; otherwise undo the increment and keep
	// the 4-byte-table candidate in DI.
	SHRQ  $0x08, SI
	MOVQ  SI, R9
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x32, R9
	MOVL  24(SP)(R9*4), BX
	INCL  CX
	MOVL  CX, 24(SP)(R9*4)
	CMPL  (DX)(BX*1), SI
	JEQ   candidate_match_encodeBetterBlockAsm12B
	DECL  CX
	MOVL  DI, BX

candidate_match_encodeBetterBlockAsm12B:
	// Extend the match backwards while the preceding bytes are equal,
	// CX stays above 12(SP) and the candidate position BX stays above 0.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeBetterBlockAsm12B

match_extend_back_loop_encodeBetterBlockAsm12B:
	CMPL CX, SI
	JBE  match_extend_back_end_encodeBetterBlockAsm12B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm12B
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeBetterBlockAsm12B
	JMP  match_extend_back_loop_encodeBetterBlockAsm12B

match_extend_back_end_encodeBetterBlockAsm12B:
	// Return 0 unless AX + pending literal bytes + 3 is below the
	// destination limit.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm12B:
	// SI = match start. Skip 4 bytes on both sides, then count further
	// equal bytes: DI = bytes left in src, R8/R9 = pointers being compared,
	// R11 = number of additional matching bytes.
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	XORL R11, R11

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B:
	// Compare 16 bytes per iteration; a non-zero XOR marks a difference.
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeBetterBlockAsm12B
	MOVQ (R8)(R11*1), R10
	MOVQ 8(R8)(R11*1), R12
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B
	XORQ 8(R9)(R11*1), R12
	JNZ  matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B
	LEAL -16(DI), DI
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B

matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B:
	// Index of the lowest set bit, divided by 8, is the number of equal
	// bytes in the second quadword; add the 8 bytes of the first one.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  match_nolit_end_encodeBetterBlockAsm12B

matchlen_match8_match_nolit_encodeBetterBlockAsm12B:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeBetterBlockAsm12B
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP  matchlen_match4_match_nolit_encodeBetterBlockAsm12B

matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B:
	// Same bit-scan as above for a difference in the first quadword.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  match_nolit_end_encodeBetterBlockAsm12B

matchlen_match4_match_nolit_encodeBetterBlockAsm12B:
	// Fewer than 8 bytes left: finish with 4-, 2- and 1-byte compares.
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeBetterBlockAsm12B
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE  matchlen_match2_match_nolit_encodeBetterBlockAsm12B
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeBetterBlockAsm12B:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeBetterBlockAsm12B
	JB   match_nolit_end_encodeBetterBlockAsm12B
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE  matchlen_match1_match_nolit_encodeBetterBlockAsm12B
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeBetterBlockAsm12B

matchlen_match1_match_nolit_encodeBetterBlockAsm12B:
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE  match_nolit_end_encodeBetterBlockAsm12B
	LEAL 1(R11), R11

match_nolit_end_encodeBetterBlockAsm12B:
	// DI = match offset (current position minus candidate position).
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	CMPL 16(SP), DI
	JEQ  match_is_repeat_encodeBetterBlockAsm12B
	MOVL DI, 16(SP)
	// Emit the pending literals src[12(SP):SI], if any.
	// R8 = literal length, R9 = literal source pointer, BX = length - 1.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm12B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	// Header size depends on length-1: below 60 one byte, below 256 two
	// bytes, otherwise three bytes.
	CMPL BX, $0x3c
	JB   one_byte_match_emit_encodeBetterBlockAsm12B
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_encodeBetterBlockAsm12B
	// The JB below reuses the flags of the JB above and so is never taken;
	// execution falls through to the same label.
	JB   three_bytes_match_emit_encodeBetterBlockAsm12B

three_bytes_match_emit_encodeBetterBlockAsm12B:
	// Header: 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm12B

two_bytes_match_emit_encodeBetterBlockAsm12B:
	// Header: 0xf0 followed by length-1 as one byte. Lengths up to 64 use
	// the short copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_match_emit_encodeBetterBlockAsm12B
	JMP  memmove_long_match_emit_encodeBetterBlockAsm12B

one_byte_match_emit_encodeBetterBlockAsm12B:
	// Header: single byte (length-1)<<2.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBetterBlockAsm12B:
	// BX = dst pointer after the literals; AX is set to it once copied.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	// Dispatch on length. Each variant loads the head and the tail of the
	// range (they may overlap) before storing; lengths up to 4 always
	// store 4 bytes.
	CMPQ R8, $0x04
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
	CMPQ R8, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
	MOVL (R9), R10
	MOVL R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm12B

memmove_long_match_emit_encodeBetterBlockAsm12B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// X0..X3 keep the first and last 32 bytes of the source; they are
	// stored last with unaligned moves. R13 = 64 - (AX & 31), so the
	// address -32(AX)(R13*1) used by the MOVOA stores is 32-byte aligned.
	// R12 = length / 32.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	// DECQ does not modify CF, so JA sees CF from the SUBQ above and ZF
	// from the DECQ.
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while the running offset R13 does not
	// exceed the length, then store the saved head and tail.
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_encodeBetterBlockAsm12B:
	// CX = end of the match, R11 = full match length (the 4 skipped bytes
	// plus the extension); everything before CX is now emitted.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
	// Lengths up to 64 are handled at two_byte_offset_short. Longer matches
	// with an offset of 2048 or more go to long_offset_short. Otherwise a
	// 2-byte copy with tag value 17 plus the offset bits is written here,
	// 8 is subtracted from the length and the rest is emitted as a repeat.
	CMPL R11, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
	CMPL DI, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm12B
	MOVL $0x00000001, BX
	LEAL 16(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R11

	// emitRepeat
	LEAL -4(R11), R11
	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
	// The instructions from here to the next label follow an unconditional
	// JMP and carry no label, so they are never executed.
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
	// R11 = remaining length - 4. Below 260 use the 3-byte repeat,
	// otherwise the 4-byte repeat: word 0x0019 followed by R11 - 256.
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
	// 3-byte repeat: word 0x0015 followed by one byte holding R11 - 4.
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
	// 2-byte repeat: 16-bit value R11<<2 | 1.
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
	// 2-byte form carrying the offset: first byte is
	// 1 + R11*4 | (offset>>8)<<5, second byte is the low offset byte.
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

long_offset_short_match_nolit_encodeBetterBlockAsm12B:
	// Length above 64 with offset of 2048 or more: write byte 0xee and the
	// 16-bit offset, subtract 60 from the length and emit the rest as a
	// repeat.
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX

	// emitRepeat
	// BX = remaining length, R11 = remaining length - 4. Lengths up to 8
	// use the 2-byte repeat; lengths below 12 with an offset below 2048 use
	// the 2-byte form that carries the offset.
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
	// Match length up to 64. BX = length<<2. A length below 12 with an
	// offset below 2048 fits the 2-byte copy: first byte is
	// (length<<2) - 15 | (offset>>8)<<5, second byte the low offset byte.
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm12B
	CMPL DI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm12B
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
	// 3-byte copy: first byte (length<<2) - 2, then the 16-bit offset.
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

match_is_repeat_encodeBetterBlockAsm12B:
	// The offset equals the previous one in 16(SP). Emit the pending
	// literals src[12(SP):SI] exactly as in the non-repeat path, then
	// encode the whole match as a repeat.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB   one_byte_match_emit_repeat_encodeBetterBlockAsm12B
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
	// Never taken: same flags as the JB above; falls through either way.
	JB   three_bytes_match_emit_repeat_encodeBetterBlockAsm12B

three_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm12B

two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_match_emit_repeat_encodeBetterBlockAsm12B
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm12B

one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm12B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x04
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
	CMPQ R8, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
	MOVL (R9), R10
	MOVL R10, (AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B

memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// Same scheme as the long copy in the non-repeat path: head and tail
	// are kept in X0..X3, the middle is stored to 32-byte aligned
	// addresses.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
	// CX = end of the match, R11 = full match length.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitRepeat
	// BX = length, R11 = length - 4; the form is selected by the same
	// thresholds as the other emitRepeat sequences in this function.
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX

match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
	// Finish with the remainder once CX reaches the search limit; return 0
	// if the write pointer has reached the destination limit.
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeBetterBlockAsm12B
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm12B:
	// Index positions inside the match just emitted. SI becomes match
	// start + 1 and R8 = CX - 2: both are stored in the 6-byte table, and
	// SI+1 / R8+1 in the 4-byte table.
	MOVQ  $0x0000cf1bbcdcbf9b, BX
	MOVQ  $0x9e3779b1, DI
	LEAQ  1(SI), SI
	LEAQ  -2(CX), R8
	MOVQ  (DX)(SI*1), R9
	MOVQ  1(DX)(SI*1), R10
	MOVQ  (DX)(R8*1), R11
	MOVQ  1(DX)(R8*1), R12
	SHLQ  $0x10, R9
	IMULQ BX, R9
	SHRQ  $0x32, R9
	SHLQ  $0x20, R10
	IMULQ DI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x10, R11
	IMULQ BX, R11
	SHRQ  $0x32, R11
	SHLQ  $0x20, R12
	IMULQ DI, R12
	SHRQ  $0x34, R12
	LEAQ  1(SI), DI
	LEAQ  1(R8), R13
	MOVL  SI, 24(SP)(R9*4)
	MOVL  R8, 24(SP)(R11*4)
	MOVL  DI, 65560(SP)(R10*4)
	MOVL  R13, 65560(SP)(R12*4)
	// DI = (SI + R8 + 1) / 2; then SI advances by one and R8 backs off by
	// one before the loop below.
	LEAQ  1(R8)(SI*1), DI
	SHRQ  $0x01, DI
	ADDQ  $0x01, SI
	SUBQ  $0x01, R8

index_loop_encodeBetterBlockAsm12B:
	// While DI < R8, store SI and DI in the 6-byte table and advance both
	// by two; then resume searching at CX.
	CMPQ  DI, R8
	JAE   search_loop_encodeBetterBlockAsm12B
	MOVQ  (DX)(SI*1), R9
	MOVQ  (DX)(DI*1), R10
	SHLQ  $0x10, R9
	IMULQ BX, R9
	SHRQ  $0x32, R9
	SHLQ  $0x10, R10
	IMULQ BX, R10
	SHRQ  $0x32, R10
	MOVL  SI, 24(SP)(R9*4)
	MOVL  DI, 24(SP)(R10*4)
	ADDQ  $0x02, SI
	ADDQ  $0x02, DI
	JMP   index_loop_encodeBetterBlockAsm12B

emit_remainder_encodeBetterBlockAsm12B:
	// Return 0 unless AX + (len(src) - 12(SP)) + 3 is below the
	// destination limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm12B:
	// Emit src[12(SP):len(src)] as literals, if any. From here on
	// SI = literal length, CX = literal source pointer, and DX is reused
	// for length - 1 and later for the dst end pointer.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBetterBlockAsm12B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBetterBlockAsm12B
	// Never taken: same flags as the JB above; falls through either way.
	JB   three_bytes_emit_remainder_encodeBetterBlockAsm12B

three_bytes_emit_remainder_encodeBetterBlockAsm12B:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm12B

two_bytes_emit_remainder_encodeBetterBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBetterBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm12B

one_byte_emit_remainder_encodeBetterBlockAsm12B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm12B:
	// DX = dst pointer after the literals, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the copies in the match paths, this one has exact 1-2 and
	// 3 byte cases instead of always storing 4 bytes.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm12B

memmove_long_emit_remainder_encodeBetterBlockAsm12B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Head and tail are kept in X0..X3 and stored last; the middle is
	// stored to 32-byte aligned addresses, with R8 = 64 - (AX & 31) as the
	// running offset and DI = length / 32.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBetterBlockAsm10B(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst and stores the number of bytes written to dst in ret.
// ret is set to 0 (and the function returns early) whenever a size check
// against the destination limit kept in 0(SP) fails.
//
// Stack frame (20504 bytes):
//       0(SP)  8 bytes   destination limit pointer:
//                        dst_base + len(src) - len(src)/32 - 6
//       8(SP)  4 bytes   search limit: len(src) - 8
//      12(SP)  4 bytes   nextEmit: src index of the first not-yet-emitted literal
//      16(SP)  4 bytes   offset of the previous match (used for repeat detection)
//      20(SP)  4 bytes   next search position computed at the top of the search loop
//      24(SP)  16384 B   long hash table, 4096 x 32-bit src indexes (12-bit hash)
//   16408(SP)  4096 B    short hash table, 1024 x 32-bit src indexes (10-bit hash)
//
// Register roles in the main loop:
//   AX = dst write pointer, DX = src base pointer, CX = current src index (s).
//   BX/DI/SI/R8-R14 are scratch and change meaning from section to section;
//   DX is reused as scratch once emit_remainder_ok is reached.
TEXT ·encodeBetterBlockAsm10B(SB), $20504-56
	MOVQ dst_base+0(FP), AX
	// 0xa0 iterations x 128 bytes = 20480 bytes: clears both hash tables
	// starting at 24(SP).
	MOVQ $0x000000a0, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm10B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm10B
	// nextEmit = 0
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	// 8(SP) = len(src) - 8; 0(SP) = dst_base + (len(src) - 6 - len(src)>>5)
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// s = 1, previous match offset = 0, DX = src base
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBetterBlockAsm10B:
	// nextS = s + 1 + ((s - nextEmit) >> 5); the step grows with the number
	// of pending literal bytes. Leave the loop once nextS >= search limit.
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x05, BX
	LEAL  1(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeBetterBlockAsm10B
	// SI = 8 bytes at src[s]; 20(SP) = nextS
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	// R9  = long hash:  ((SI << 16) * 0xcf1bbcdcbf9b) >> 52  (low 6 bytes, 12 bits)
	// R10 = short hash: ((SI << 32) * 0x9e3779b1) >> 54      (low 4 bytes, 10 bits)
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  $0x9e3779b1, BX
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x34, R9
	SHLQ  $0x20, R10
	IMULQ BX, R10
	SHRQ  $0x36, R10
	// BX = long-table candidate, DI = short-table candidate; then record s
	// in both tables.
	MOVL  24(SP)(R9*4), BX
	MOVL  16408(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	MOVL  CX, 16408(SP)(R10*4)
	// Load 8 bytes at each candidate and prefer a full 8-byte match:
	// long candidate first, then short candidate.
	MOVQ  (DX)(BX*1), R9
	MOVQ  (DX)(DI*1), R10
	CMPQ  R9, SI
	JEQ   candidate_match_encodeBetterBlockAsm10B
	CMPQ  R10, SI
	JNE   no_short_found_encodeBetterBlockAsm10B
	MOVL  DI, BX
	JMP   candidate_match_encodeBetterBlockAsm10B

no_short_found_encodeBetterBlockAsm10B:
	// No 8-byte match: fall back to comparing only the low 4 bytes.
	CMPL R9, SI
	JEQ  candidate_match_encodeBetterBlockAsm10B
	CMPL R10, SI
	JEQ  candidateS_match_encodeBetterBlockAsm10B
	// Nothing matched: s = nextS and try again.
	MOVL 20(SP), CX
	JMP  search_loop_encodeBetterBlockAsm10B

candidateS_match_encodeBetterBlockAsm10B:
	// Short candidate matched on 4 bytes. Before accepting it, probe the
	// long table at s+1: SI >>= 8 yields the bytes starting at src[s+1].
	SHRQ  $0x08, SI
	MOVQ  SI, R9
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x34, R9
	MOVL  24(SP)(R9*4), BX
	INCL  CX
	MOVL  CX, 24(SP)(R9*4)
	// Use the long candidate at s+1 if its first 4 bytes match ...
	CMPL  (DX)(BX*1), SI
	JEQ   candidate_match_encodeBetterBlockAsm10B
	// ... otherwise restore s and use the short candidate.
	DECL  CX
	MOVL  DI, BX

candidate_match_encodeBetterBlockAsm10B:
	// Extend the match backwards: while s > nextEmit, candidate > 0 and
	// src[candidate-1] == src[s-1], decrement both.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeBetterBlockAsm10B

match_extend_back_loop_encodeBetterBlockAsm10B:
	CMPL CX, SI
	JBE  match_extend_back_end_encodeBetterBlockAsm10B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm10B
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeBetterBlockAsm10B
	JMP  match_extend_back_loop_encodeBetterBlockAsm10B

match_extend_back_end_encodeBetterBlockAsm10B:
	// Bail out with ret = 0 unless AX + (s - nextEmit) + 3 is below the
	// destination limit.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm10B:
	// SI = match start (base). Skip the first 4 bytes on both sides, then
	// DI = bytes remaining in src, R8 = &src[s+4], R9 = &src[candidate+4].
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	// R11 accumulates the number of equal bytes at R8/R9; DI counts the
	// bytes still available. Compared 16, 8, 4, 2 and 1 bytes at a time.
	XORL R11, R11

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeBetterBlockAsm10B
	MOVQ (R8)(R11*1), R10
	MOVQ 8(R8)(R11*1), R12
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B
	XORQ 8(R9)(R11*1), R12
	JNZ  matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B
	LEAL -16(DI), DI
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B

matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B:
	// Mismatch in the second 8 bytes: lowest set bit index / 8 gives the
	// number of equal bytes in that quadword; add 8 for the first quadword.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  match_nolit_end_encodeBetterBlockAsm10B

matchlen_match8_match_nolit_encodeBetterBlockAsm10B:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeBetterBlockAsm10B
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP  matchlen_match4_match_nolit_encodeBetterBlockAsm10B

matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B:
	// Mismatch inside the quadword in R10: add (lowest set bit index / 8).
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  match_nolit_end_encodeBetterBlockAsm10B

matchlen_match4_match_nolit_encodeBetterBlockAsm10B:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeBetterBlockAsm10B
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE  matchlen_match2_match_nolit_encodeBetterBlockAsm10B
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeBetterBlockAsm10B:
	// DI == 1: single byte left; DI == 0: done; otherwise compare 2 bytes.
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeBetterBlockAsm10B
	JB   match_nolit_end_encodeBetterBlockAsm10B
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE  matchlen_match1_match_nolit_encodeBetterBlockAsm10B
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeBetterBlockAsm10B

matchlen_match1_match_nolit_encodeBetterBlockAsm10B:
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE  match_nolit_end_encodeBetterBlockAsm10B
	LEAL 1(R11), R11

match_nolit_end_encodeBetterBlockAsm10B:
	// DI = offset = s - candidate (both were advanced by 4, so the
	// difference is unchanged).
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	CMPL 16(SP), DI
	JEQ  match_is_repeat_encodeBetterBlockAsm10B
	// New offset: remember it, then emit pending literals src[nextEmit:base].
	MOVL DI, 16(SP)
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm10B
	// R8 = literal length, R9 = &src[nextEmit], BX = length - 1;
	// nextEmit is set to base.
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	// Header size is selected by (length - 1): < 60 one byte, < 256 two
	// bytes, otherwise three bytes.
	CMPL BX, $0x3c
	JB   one_byte_match_emit_encodeBetterBlockAsm10B
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_encodeBetterBlockAsm10B
	// NOTE(review): this JB repeats the condition tested just above, so it
	// is never taken here; execution falls through to the same label anyway.
	JB   three_bytes_match_emit_encodeBetterBlockAsm10B

three_bytes_match_emit_encodeBetterBlockAsm10B:
	// Header: 0xf4 followed by (length - 1) as a 16-bit little-endian value.
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm10B

two_bytes_match_emit_encodeBetterBlockAsm10B:
	// Header: 0xf0 followed by (length - 1) as one byte. Lengths with
	// (length - 1) < 64 use the short copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_match_emit_encodeBetterBlockAsm10B
	JMP  memmove_long_match_emit_encodeBetterBlockAsm10B

one_byte_match_emit_encodeBetterBlockAsm10B:
	// Header: single byte (length - 1) << 2.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBetterBlockAsm10B:
	// BX = dst pointer after the literals; the copy below picks a pair of
	// possibly overlapping head/tail moves based on the length in R8.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x04
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
	CMPQ R8, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
	// Lengths <= 4 always move a full 4 bytes; AX is then set to BX, so
	// any surplus bytes land past the new write pointer.
	MOVL (R9), R10
	MOVL R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm10B

memmove_long_match_emit_encodeBetterBlockAsm10B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// X0/X1 = first 32 source bytes, X2/X3 = last 32; they are stored last,
	// with unaligned moves. R13 = 64 - (AX & 31) is the running offset, so
	// the MOVOA stores at -32(AX)(R13*1) hit 32-byte aligned addresses.
	// R12 = (length >> 5) - 1.
	// NOTE(review): DECQ leaves CF untouched, so the JA/JNA below also
	// depend on CF from the preceding SUBQ/ADDQ.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while length >= R13.
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_encodeBetterBlockAsm10B:
	// s += extra match length; R11 = total match length (extra + 4);
	// nextEmit = s.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
	// R11 = length, DI = offset. Lengths <= 64 take the single-copy path.
	CMPL R11, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
	CMPL DI, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm10B
	// length > 64 and offset < 2048: write a 2-byte copy with
	// byte0 = 17 | (offset >> 8) << 5 and byte1 = low offset byte, deduct 8
	// from the length and emit the rest as a repeat.
	MOVL $0x00000001, BX
	LEAL 16(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R11

	// emitRepeat
	LEAL -4(R11), R11
	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
	// NOTE(review): the instructions from here to the next label follow an
	// unconditional JMP with no intervening label, so they appear to be
	// unreachable.
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	// R11 >= 260: 4-byte form, word 0x0019 followed by the 16-bit value
	// R11 - 256. Otherwise use the 3-byte form below.
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	// 3-byte form: word 0x0015 followed by the byte R11 - 4.
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	// 2-byte form: 16-bit value (R11 << 2) | 1.
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	// 2-byte form carrying the offset:
	// byte0 = 1 + R11*4 | (offset >> 8) << 5, byte1 = low offset byte.
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

long_offset_short_match_nolit_encodeBetterBlockAsm10B:
	// length > 64 and offset >= 2048: write 0xee plus the 16-bit offset,
	// deduct 60 from the length and emit the rest as a repeat.
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX

	// emitRepeat
	// BX = remaining length, R11 = remaining length - 4. Form selection:
	// BX <= 8 -> 2-byte; BX < 12 and offset < 2048 -> 2-byte with offset;
	// otherwise 3- or 4-byte.
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
	// length <= 64. BX = length << 2. With length < 12 and offset < 2048
	// emit 2 bytes: byte0 = (BX - 15) | (offset >> 8) << 5, byte1 = low
	// offset byte. Otherwise emit 3 bytes (see emit_copy_three below).
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm10B
	CMPL DI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm10B
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
	// 3-byte copy: byte0 = (length << 2) - 2, followed by the 16-bit offset.
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

match_is_repeat_encodeBetterBlockAsm10B:
	// Offset equals the previous match offset: emit pending literals the
	// same way as above, then encode the match as a repeat.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
	// R8 = literal length, R9 = &src[nextEmit], BX = length - 1;
	// nextEmit is set to base.
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB   one_byte_match_emit_repeat_encodeBetterBlockAsm10B
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
	// NOTE(review): same condition as the JB above, so never taken here;
	// execution falls through to the same label anyway.
	JB   three_bytes_match_emit_repeat_encodeBetterBlockAsm10B

three_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm10B

two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_match_emit_repeat_encodeBetterBlockAsm10B
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm10B

one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm10B:
	// BX = dst pointer after the literals.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x04
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
	CMPQ R8, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
	MOVL (R9), R10
	MOVL R10, (AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B

memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// Same scheme as the long literal copy on the non-repeat path: head and
	// tail 32 bytes are kept in X0-X3 and stored last; the middle is moved
	// 32 bytes at a time with aligned stores.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
	// s += extra match length; R11 = total match length; nextEmit = s.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitRepeat
	// BX = length, R11 = length - 4. Form selection: BX <= 8 -> 2-byte;
	// BX < 12 and offset < 2048 -> 2-byte with offset; otherwise 3- or
	// 4-byte.
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX

match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
	// Match emitted. Finish with the remainder if s reached the search
	// limit; return 0 if the write pointer reached the destination limit.
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeBetterBlockAsm10B
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm10B:
	// Index positions around the match. SI = base + 1, R8 = s - 2.
	// Long table receives SI and R8 (hashes of src[SI], src[R8]); short
	// table receives SI + 1 and R8 + 1 (hashes of src[SI+1], src[R8+1]).
	MOVQ  $0x0000cf1bbcdcbf9b, BX
	MOVQ  $0x9e3779b1, DI
	LEAQ  1(SI), SI
	LEAQ  -2(CX), R8
	MOVQ  (DX)(SI*1), R9
	MOVQ  1(DX)(SI*1), R10
	MOVQ  (DX)(R8*1), R11
	MOVQ  1(DX)(R8*1), R12
	SHLQ  $0x10, R9
	IMULQ BX, R9
	SHRQ  $0x34, R9
	SHLQ  $0x20, R10
	IMULQ DI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x10, R11
	IMULQ BX, R11
	SHRQ  $0x34, R11
	SHLQ  $0x20, R12
	IMULQ DI, R12
	SHRQ  $0x36, R12
	LEAQ  1(SI), DI
	LEAQ  1(R8), R13
	MOVL  SI, 24(SP)(R9*4)
	MOVL  R8, 24(SP)(R11*4)
	MOVL  DI, 16408(SP)(R10*4)
	MOVL  R13, 16408(SP)(R12*4)
	// DI = (SI + R8 + 1) >> 1 (midpoint); then SI = base + 2, R8 = s - 3.
	LEAQ  1(R8)(SI*1), DI
	SHRQ  $0x01, DI
	ADDQ  $0x01, SI
	SUBQ  $0x01, R8

index_loop_encodeBetterBlockAsm10B:
	// While DI < R8: add SI and DI to the long table, stepping both by 2.
	// Then resume searching at s (CX).
	CMPQ  DI, R8
	JAE   search_loop_encodeBetterBlockAsm10B
	MOVQ  (DX)(SI*1), R9
	MOVQ  (DX)(DI*1), R10
	SHLQ  $0x10, R9
	IMULQ BX, R9
	SHRQ  $0x34, R9
	SHLQ  $0x10, R10
	IMULQ BX, R10
	SHRQ  $0x34, R10
	MOVL  SI, 24(SP)(R9*4)
	MOVL  DI, 24(SP)(R10*4)
	ADDQ  $0x02, SI
	ADDQ  $0x02, DI
	JMP   index_loop_encodeBetterBlockAsm10B

emit_remainder_encodeBetterBlockAsm10B:
	// Return 0 unless AX + (len(src) - nextEmit) + 3 is below the
	// destination limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm10B:
	// Emit src[nextEmit:] as literals. From here on: CX = &src[nextEmit],
	// SI = literal length, DX = length - 1 (DX no longer holds src base).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBetterBlockAsm10B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBetterBlockAsm10B
	// NOTE(review): same condition as the JB above, so never taken here;
	// execution falls through to the same label anyway.
	JB   three_bytes_emit_remainder_encodeBetterBlockAsm10B

three_bytes_emit_remainder_encodeBetterBlockAsm10B:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm10B

two_bytes_emit_remainder_encodeBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBetterBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm10B

one_byte_emit_remainder_encodeBetterBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm10B:
	// DX = dst pointer after the literals, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the in-loop literal copies, this variant has dedicated 1-2 and
	// 3 byte cases, so no load or store extends past the requested length.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm10B

memmove_long_emit_remainder_encodeBetterBlockAsm10B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Head and tail 32 bytes are kept in X0-X3 and stored last; the middle
	// is moved 32 bytes at a time, with R8 = 64 - (AX & 31) as the running
	// offset so the MOVOA stores are aligned.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
	// ret = AX - dst_base: number of bytes written to dst.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBetterBlockAsm8B(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000028, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm8B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm8B
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBetterBlockAsm8B:
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x04, BX
	LEAL  1(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeBetterBlockAsm8B
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  $0x9e3779b1, BX
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x36, R9
	SHLQ  $0x20, R10
	IMULQ BX, R10
	SHRQ  $0x38, R10
	MOVL  24(SP)(R9*4), BX
	MOVL  4120(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	MOVL  CX, 4120(SP)(R10*4)
	MOVQ  (DX)(BX*1), R9
	MOVQ  (DX)(DI*1), R10
	CMPQ  R9, SI
	JEQ   candidate_match_encodeBetterBlockAsm8B
	CMPQ  R10, SI
	JNE   no_short_found_encodeBetterBlockAsm8B
	MOVL  DI, BX
	JMP   candidate_match_encodeBetterBlockAsm8B

no_short_found_encodeBetterBlockAsm8B:
	CMPL R9, SI
	JEQ  candidate_match_encodeBetterBlockAsm8B
	CMPL R10, SI
	JEQ  candidateS_match_encodeBetterBlockAsm8B
	MOVL 20(SP), CX
	JMP  search_loop_encodeBetterBlockAsm8B

candidateS_match_encodeBetterBlockAsm8B:
	SHRQ  $0x08, SI
	MOVQ  SI, R9
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x36, R9
	MOVL  24(SP)(R9*4), BX
	INCL  CX
	MOVL  CX, 24(SP)(R9*4)
	CMPL  (DX)(BX*1), SI
	JEQ   candidate_match_encodeBetterBlockAsm8B
	DECL  CX
	MOVL  DI, BX

candidate_match_encodeBetterBlockAsm8B:
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeBetterBlockAsm8B

match_extend_back_loop_encodeBetterBlockAsm8B:
	CMPL CX, SI
	JBE  match_extend_back_end_encodeBetterBlockAsm8B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm8B
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeBetterBlockAsm8B
	JMP  match_extend_back_loop_encodeBetterBlockAsm8B

match_extend_back_end_encodeBetterBlockAsm8B:
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm8B:
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	XORL R11, R11

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeBetterBlockAsm8B
	MOVQ (R8)(R11*1), R10
	MOVQ 8(R8)(R11*1), R12
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
	XORQ 8(R9)(R11*1), R12
	JNZ  matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B
	LEAL -16(DI), DI
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B

matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  match_nolit_end_encodeBetterBlockAsm8B

matchlen_match8_match_nolit_encodeBetterBlockAsm8B:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeBetterBlockAsm8B
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP  matchlen_match4_match_nolit_encodeBetterBlockAsm8B

matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  match_nolit_end_encodeBetterBlockAsm8B

matchlen_match4_match_nolit_encodeBetterBlockAsm8B:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeBetterBlockAsm8B
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE  matchlen_match2_match_nolit_encodeBetterBlockAsm8B
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeBetterBlockAsm8B:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeBetterBlockAsm8B
	JB   match_nolit_end_encodeBetterBlockAsm8B
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE  matchlen_match1_match_nolit_encodeBetterBlockAsm8B
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeBetterBlockAsm8B

matchlen_match1_match_nolit_encodeBetterBlockAsm8B:
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE  match_nolit_end_encodeBetterBlockAsm8B
	LEAL 1(R11), R11

match_nolit_end_encodeBetterBlockAsm8B:
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	CMPL 16(SP), DI
	JEQ  match_is_repeat_encodeBetterBlockAsm8B
	MOVL DI, 16(SP)
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm8B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB   one_byte_match_emit_encodeBetterBlockAsm8B
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_encodeBetterBlockAsm8B
	JB   three_bytes_match_emit_encodeBetterBlockAsm8B

three_bytes_match_emit_encodeBetterBlockAsm8B:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm8B

two_bytes_match_emit_encodeBetterBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_match_emit_encodeBetterBlockAsm8B
	JMP  memmove_long_match_emit_encodeBetterBlockAsm8B

one_byte_match_emit_encodeBetterBlockAsm8B:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBetterBlockAsm8B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x04
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
	CMPQ R8, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
	MOVL (R9), R10
	MOVL R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm8B

memmove_long_match_emit_encodeBetterBlockAsm8B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_encodeBetterBlockAsm8B:
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
	CMPL R11, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
	CMPL DI, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm8B
	MOVL $0x00000001, BX
	LEAL 16(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R11

	// emitRepeat
	LEAL -4(R11), R11
	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

long_offset_short_match_nolit_encodeBetterBlockAsm8B:
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX

	// emitRepeat
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm8B
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

match_is_repeat_encodeBetterBlockAsm8B:
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
	MOVL SI, DI
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R8
	SUBL BX, DI
	LEAL -1(DI), BX
	CMPL BX, $0x3c
	JB   one_byte_match_emit_repeat_encodeBetterBlockAsm8B
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
	JB   three_bytes_match_emit_repeat_encodeBetterBlockAsm8B

three_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm8B

two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_match_emit_repeat_encodeBetterBlockAsm8B
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm8B

one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm8B:
	LEAQ (AX)(DI*1), BX

	// genMemMoveShort
	CMPQ DI, $0x04
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
	CMPQ DI, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ DI, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ DI, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
	MOVL (R8), R9
	MOVL R9, (AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL (R8), R9
	MOVL -4(R8)(DI*1), R8
	MOVL R9, (AX)
	MOVL R8, -4(AX)(DI*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (AX)
	MOVQ R8, -8(AX)(DI*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DI*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B

memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
	LEAQ (AX)(DI*1), BX

	// genMemMoveLong
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVQ  DI, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R12
	SUBQ  R9, R12
	DECQ  R10
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R8)(R12*1), R9
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R9
	ADDQ  $0x20, R12
	DECQ  R10
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R8)(R12*1), X4
	MOVOU -16(R8)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  DI, R12
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitRepeat
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JBE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
	CMPL BX, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
	CMPL R11, $0x00000104
	JB   repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
	SHLL $0x02, R11
	ORL  $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX

match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeBetterBlockAsm8B
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm8B:
	MOVQ  $0x0000cf1bbcdcbf9b, BX
	MOVQ  $0x9e3779b1, DI
	LEAQ  1(SI), SI
	LEAQ  -2(CX), R8
	MOVQ  (DX)(SI*1), R9
	MOVQ  1(DX)(SI*1), R10
	MOVQ  (DX)(R8*1), R11
	MOVQ  1(DX)(R8*1), R12
	SHLQ  $0x10, R9
	IMULQ BX, R9
	SHRQ  $0x36, R9
	SHLQ  $0x20, R10
	IMULQ DI, R10
	SHRQ  $0x38, R10
	SHLQ  $0x10, R11
	IMULQ BX, R11
	SHRQ  $0x36, R11
	SHLQ  $0x20, R12
	IMULQ DI, R12
	SHRQ  $0x38, R12
	LEAQ  1(SI), DI
	LEAQ  1(R8), R13
	MOVL  SI, 24(SP)(R9*4)
	MOVL  R8, 24(SP)(R11*4)
	MOVL  DI, 4120(SP)(R10*4)
	MOVL  R13, 4120(SP)(R12*4)
	LEAQ  1(R8)(SI*1), DI
	SHRQ  $0x01, DI
	ADDQ  $0x01, SI
	SUBQ  $0x01, R8

index_loop_encodeBetterBlockAsm8B:
	CMPQ  DI, R8
	JAE   search_loop_encodeBetterBlockAsm8B
	MOVQ  (DX)(SI*1), R9
	MOVQ  (DX)(DI*1), R10
	SHLQ  $0x10, R9
	IMULQ BX, R9
	SHRQ  $0x36, R9
	SHLQ  $0x10, R10
	IMULQ BX, R10
	SHRQ  $0x36, R10
	MOVL  SI, 24(SP)(R9*4)
	MOVL  DI, 24(SP)(R10*4)
	ADDQ  $0x02, SI
	ADDQ  $0x02, DI
	JMP   index_loop_encodeBetterBlockAsm8B

emit_remainder_encodeBetterBlockAsm8B:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm8B:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBetterBlockAsm8B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBetterBlockAsm8B
	JB   three_bytes_emit_remainder_encodeBetterBlockAsm8B

three_bytes_emit_remainder_encodeBetterBlockAsm8B:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm8B

two_bytes_emit_remainder_encodeBetterBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBetterBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm8B

one_byte_emit_remainder_encodeBetterBlockAsm8B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm8B

memmove_long_emit_remainder_encodeBetterBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBlockAsm(SB), $65560-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm:
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x06, BX
	LEAL  4(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeSnappyBlockAsm
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHRQ  $0x08, R10
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x32, R9
	SHLQ  $0x10, R10
	IMULQ R8, R10
	SHRQ  $0x32, R10
	MOVL  24(SP)(R9*4), BX
	MOVL  24(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	LEAL  1(CX), R9
	MOVL  R9, 24(SP)(R10*4)
	MOVQ  SI, R9
	SHRQ  $0x10, R9
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x32, R9
	MOVL  CX, R8
	SUBL  16(SP), R8
	MOVL  1(DX)(R8*1), R10
	MOVQ  SI, R8
	SHRQ  $0x08, R8
	CMPL  R8, R10
	JNE   no_repeat_found_encodeSnappyBlockAsm
	LEAL  1(CX), SI
	MOVL  12(SP), BX
	MOVL  SI, DI
	SUBL  16(SP), DI
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm

repeat_extend_back_loop_encodeSnappyBlockAsm:
	CMPL SI, BX
	JBE  repeat_extend_back_end_encodeSnappyBlockAsm
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm
	LEAL -1(SI), SI
	DECL DI
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm

repeat_extend_back_end_encodeSnappyBlockAsm:
	MOVL SI, BX
	SUBL 12(SP), BX
	LEAQ 5(AX)(BX*1), BX
	CMPQ BX, (SP)
	JB   repeat_dst_size_check_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

repeat_dst_size_check_encodeSnappyBlockAsm:
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm
	MOVL SI, DI
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R8
	SUBL BX, DI
	LEAL -1(DI), BX
	CMPL BX, $0x3c
	JB   one_byte_repeat_emit_encodeSnappyBlockAsm
	CMPL BX, $0x00000100
	JB   two_bytes_repeat_emit_encodeSnappyBlockAsm
	CMPL BX, $0x00010000
	JB   three_bytes_repeat_emit_encodeSnappyBlockAsm
	CMPL BX, $0x01000000
	JB   four_bytes_repeat_emit_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL BX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

four_bytes_repeat_emit_encodeSnappyBlockAsm:
	MOVL BX, R9
	SHRL $0x10, R9
	MOVB $0xf8, (AX)
	MOVW BX, 1(AX)
	MOVB R9, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

three_bytes_repeat_emit_encodeSnappyBlockAsm:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

two_bytes_repeat_emit_encodeSnappyBlockAsm:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_repeat_emit_encodeSnappyBlockAsm
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

one_byte_repeat_emit_encodeSnappyBlockAsm:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeSnappyBlockAsm:
	LEAQ (AX)(DI*1), BX

	// genMemMoveShort
	CMPQ DI, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
	CMPQ DI, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ DI, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
	MOVQ (R8), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (AX)
	MOVQ R8, -8(AX)(DI*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DI*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
	MOVQ BX, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm

memmove_long_repeat_emit_encodeSnappyBlockAsm:
	LEAQ (AX)(DI*1), BX

	// genMemMoveLong
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVQ  DI, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	DECQ  R10
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R8)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(R8)(R11*1), X4
	MOVOU -16(R8)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  DI, R11
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)
	MOVQ  BX, AX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), BX

	// matchLen
	XORL R10, R10

matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm:
	CMPL DI, $0x10
	JB   matchlen_match8_repeat_extend_encodeSnappyBlockAsm
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm
	XORQ 8(BX)(R10*1), R11
	JNZ  matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm

matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_match8_repeat_extend_encodeSnappyBlockAsm:
	CMPL DI, $0x08
	JB   matchlen_match4_repeat_extend_encodeSnappyBlockAsm
	MOVQ (R8)(R10*1), R9
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_repeat_extend_encodeSnappyBlockAsm

matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_match4_repeat_extend_encodeSnappyBlockAsm:
	CMPL DI, $0x04
	JB   matchlen_match2_repeat_extend_encodeSnappyBlockAsm
	MOVL (R8)(R10*1), R9
	CMPL (BX)(R10*1), R9
	JNE  matchlen_match2_repeat_extend_encodeSnappyBlockAsm
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_encodeSnappyBlockAsm:
	CMPL DI, $0x01
	JE   matchlen_match1_repeat_extend_encodeSnappyBlockAsm
	JB   repeat_extend_forward_end_encodeSnappyBlockAsm
	MOVW (R8)(R10*1), R9
	CMPW (BX)(R10*1), R9
	JNE  matchlen_match1_repeat_extend_encodeSnappyBlockAsm
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_match1_repeat_extend_encodeSnappyBlockAsm:
	MOVB (R8)(R10*1), R9
	CMPB (BX)(R10*1), R9
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm
	LEAL 1(R10), R10

repeat_extend_forward_end_encodeSnappyBlockAsm:
	ADDL R10, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI

	// emitCopy
	CMPL SI, $0x00010000
	JB   two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm

four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
	CMPL BX, $0x40
	JBE  four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
	MOVB $0xff, (AX)
	MOVL SI, 1(AX)
	LEAL -64(BX), BX
	ADDQ $0x05, AX
	CMPL BX, $0x04
	JB   four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
	JMP  four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm

four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
	TESTL BX, BX
	JZ    repeat_end_emit_encodeSnappyBlockAsm
	XORL  DI, DI
	LEAL  -1(DI)(BX*4), BX
	MOVB  BL, (AX)
	MOVL  SI, 1(AX)
	ADDQ  $0x05, AX
	JMP   repeat_end_emit_encodeSnappyBlockAsm

two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
	CMPL BX, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
	CMPL SI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm:
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm

no_repeat_found_encodeSnappyBlockAsm:
	CMPL (DX)(BX*1), SI
	JEQ  candidate_match_encodeSnappyBlockAsm
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ  candidate2_match_encodeSnappyBlockAsm
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ  candidate3_match_encodeSnappyBlockAsm
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm

candidate3_match_encodeSnappyBlockAsm:
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm

candidate2_match_encodeSnappyBlockAsm:
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeSnappyBlockAsm:
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeSnappyBlockAsm

match_extend_back_loop_encodeSnappyBlockAsm:
	CMPL CX, SI
	JBE  match_extend_back_end_encodeSnappyBlockAsm
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeSnappyBlockAsm
	JMP  match_extend_back_loop_encodeSnappyBlockAsm

match_extend_back_end_encodeSnappyBlockAsm:
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 5(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm:
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	CMPL DI, $0x3c
	JB   one_byte_match_emit_encodeSnappyBlockAsm
	CMPL DI, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBlockAsm
	CMPL DI, $0x00010000
	JB   three_bytes_match_emit_encodeSnappyBlockAsm
	CMPL DI, $0x01000000
	JB   four_bytes_match_emit_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL DI, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

four_bytes_match_emit_encodeSnappyBlockAsm:
	MOVL DI, R9
	SHRL $0x10, R9
	MOVB $0xf8, (AX)
	MOVW DI, 1(AX)
	MOVB R9, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

three_bytes_match_emit_encodeSnappyBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

two_bytes_match_emit_encodeSnappyBlockAsm:
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	CMPL DI, $0x40
	JB   memmove_match_emit_encodeSnappyBlockAsm
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

one_byte_match_emit_encodeSnappyBlockAsm:
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBlockAsm:
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm:
	MOVQ DI, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm

memmove_long_match_emit_encodeSnappyBlockAsm:
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVQ  R8, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	DECQ  R10
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(SI)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  R8, R11
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  DI, AX

emit_literal_done_match_emit_encodeSnappyBlockAsm:
match_nolit_loop_encodeSnappyBlockAsm:
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	XORL R9, R9

matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm:
	CMPL SI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBlockAsm
	MOVQ (DI)(R9*1), R8
	MOVQ 8(DI)(R9*1), R10
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm
	XORQ 8(BX)(R9*1), R10
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBlockAsm
	LEAL -16(SI), SI
	LEAL 16(R9), R9
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm

matchlen_bsf_16match_nolit_encodeSnappyBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL 8(R9)(R10*1), R9
	JMP  match_nolit_end_encodeSnappyBlockAsm

matchlen_match8_match_nolit_encodeSnappyBlockAsm:
	CMPL SI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBlockAsm
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	JMP  matchlen_match4_match_nolit_encodeSnappyBlockAsm

matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP  match_nolit_end_encodeSnappyBlockAsm

matchlen_match4_match_nolit_encodeSnappyBlockAsm:
	CMPL SI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBlockAsm
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE  matchlen_match2_match_nolit_encodeSnappyBlockAsm
	LEAL -4(SI), SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeSnappyBlockAsm:
	CMPL SI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBlockAsm
	JB   match_nolit_end_encodeSnappyBlockAsm
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE  matchlen_match1_match_nolit_encodeSnappyBlockAsm
	LEAL 2(R9), R9
	SUBL $0x02, SI
	JZ   match_nolit_end_encodeSnappyBlockAsm

matchlen_match1_match_nolit_encodeSnappyBlockAsm:
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE  match_nolit_end_encodeSnappyBlockAsm
	LEAL 1(R9), R9

match_nolit_end_encodeSnappyBlockAsm:
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	CMPL BX, $0x00010000
	JB   two_byte_offset_match_nolit_encodeSnappyBlockAsm

four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
	CMPL R9, $0x40
	JBE  four_bytes_remain_match_nolit_encodeSnappyBlockAsm
	MOVB $0xff, (AX)
	MOVL BX, 1(AX)
	LEAL -64(R9), R9
	ADDQ $0x05, AX
	CMPL R9, $0x04
	JB   four_bytes_remain_match_nolit_encodeSnappyBlockAsm
	JMP  four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm

four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
	TESTL R9, R9
	JZ    match_nolit_emitcopy_end_encodeSnappyBlockAsm
	XORL  SI, SI
	LEAL  -1(SI)(R9*4), R9
	MOVB  R9, (AX)
	MOVL  BX, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeSnappyBlockAsm

two_byte_offset_match_nolit_encodeSnappyBlockAsm:
	CMPL R9, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm
	CMPL BX, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm

emit_copy_three_match_nolit_encodeSnappyBlockAsm:
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBlockAsm:
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeSnappyBlockAsm
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm:
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  SI, DI
	SHRQ  $0x10, SI
	MOVQ  SI, BX
	SHLQ  $0x10, DI
	IMULQ R8, DI
	SHRQ  $0x32, DI
	SHLQ  $0x10, BX
	IMULQ R8, BX
	SHRQ  $0x32, BX
	LEAL  -2(CX), R8
	LEAQ  24(SP)(BX*4), R9
	MOVL  (R9), BX
	MOVL  R8, 24(SP)(DI*4)
	MOVL  CX, (R9)
	CMPL  (DX)(BX*1), SI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm
	INCL  CX
	JMP   search_loop_encodeSnappyBlockAsm

emit_remainder_encodeSnappyBlockAsm:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x00010000
	JB   three_bytes_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x01000000
	JB   four_bytes_emit_remainder_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL DX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

four_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

three_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

two_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBlockAsm
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

one_byte_emit_remainder_encodeSnappyBlockAsm:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBlockAsm:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm

memmove_long_emit_remainder_encodeSnappyBlockAsm:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal and copy elements and returns
// the number of bytes written (final dst pointer minus dst_base). Returns 0
// when the output would reach the dst limit computed in the prologue.
//
// NOTE(review): only 1- and 2-byte literal length encodings and 2-byte copy
// offsets are emitted here, so this presumably requires len(src) <= 64 KiB;
// confirm against the Go caller.
//
// Stack frame (65560 bytes):
//   0(SP)   8 bytes      dst limit pointer: dst_base + src_len - 9 - src_len/32
//   8(SP)   4 bytes      src_len - 8; the search stops once a position reaches it
//   12(SP)  4 bytes      src offset of the first byte not yet emitted
//   16(SP)  4 bytes      distance of the most recent match (starts at 1)
//   20(SP)  4 bytes      src offset to resume from when no candidate matches
//   24(SP)  65536 bytes  hash table: 16384 entries of 4-byte src offsets
//
// Long-lived registers: AX = dst write pointer, CX = current src offset,
// DX = src base pointer (reused as a scratch length in emit_remainder).
TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
	MOVQ dst_base+0(FP), AX
	// Zero the hash table: 0x200 iterations of 128 bytes = 65536 bytes.
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm64K:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm64K
	// Nothing emitted yet.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	// DX = src_len - 9, BX = src_len - 8 (search limit, kept at 8(SP)).
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	// dst limit = dst_base + (src_len - 9) - (src_len >> 5), kept at 0(SP).
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// Start searching at offset 1 with an initial match distance of 1.
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm64K:
	// BX = CX + 4 + ((CX - 12(SP)) >> 6): the step grows with the distance
	// since the last emitted byte. Leave the search when BX reaches 8(SP).
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x06, BX
	LEAL  4(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeSnappyBlockAsm64K
	// SI = 8 bytes of src at CX; remember BX as the resume position.
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	// Hash of the low 6 bytes: SHLQ 16 drops the top two bytes, multiply,
	// then keep the top 14 bits. R9 = hash at CX, R10 = hash at CX+1.
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHRQ  $0x08, R10
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x32, R9
	SHLQ  $0x10, R10
	IMULQ R8, R10
	SHRQ  $0x32, R10
	// Fetch the previous table entries (BX for CX, DI for CX+1), then
	// overwrite them with CX and CX+1.
	MOVL  24(SP)(R9*4), BX
	MOVL  24(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	LEAL  1(CX), R9
	MOVL  R9, 24(SP)(R10*4)
	// R9 = hash at CX+2, used by no_repeat_found below.
	MOVQ  SI, R9
	SHRQ  $0x10, R9
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x32, R9
	// Repeat check: compare the 4 bytes at CX+1 (low 32 bits of SI>>8) with
	// the 4 bytes at CX+1 minus the last match distance.
	MOVL  CX, R8
	SUBL  16(SP), R8
	MOVL  1(DX)(R8*1), R10
	MOVQ  SI, R8
	SHRQ  $0x08, R8
	CMPL  R8, R10
	JNE   no_repeat_found_encodeSnappyBlockAsm64K
	// Repeat found at CX+1. SI = match start, BX = first unemitted offset,
	// DI = SI - distance (the earlier occurrence). Skip extension if DI == 0.
	LEAL  1(CX), SI
	MOVL  12(SP), BX
	MOVL  SI, DI
	SUBL  16(SP), DI
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm64K

repeat_extend_back_loop_encodeSnappyBlockAsm64K:
	// Extend the match backwards one byte at a time while SI stays above
	// the first unemitted offset, the preceding bytes agree and DI > 0.
	CMPL SI, BX
	JBE  repeat_extend_back_end_encodeSnappyBlockAsm64K
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm64K
	LEAL -1(SI), SI
	DECL DI
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm64K

repeat_extend_back_end_encodeSnappyBlockAsm64K:
	// Bound check: AX + pending literal length + 3 must stay below the dst
	// limit, otherwise return 0.
	MOVL SI, BX
	SUBL 12(SP), BX
	LEAQ 3(AX)(BX*1), BX
	CMPQ BX, (SP)
	JB   repeat_dst_size_check_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

repeat_dst_size_check_encodeSnappyBlockAsm64K:
	// Emit src[12(SP):SI] as a literal (nothing to do if the range is empty).
	// DI = literal length, R8 = literal source pointer, BX = length - 1.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
	MOVL SI, DI
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R8
	SUBL BX, DI
	LEAL -1(DI), BX
	// Choose the header size from length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes.
	CMPL BX, $0x3c
	JB   one_byte_repeat_emit_encodeSnappyBlockAsm64K
	CMPL BX, $0x00000100
	JB   two_bytes_repeat_emit_encodeSnappyBlockAsm64K
	// Flags are unchanged since the CMPL above, so this JB cannot be taken;
	// execution falls through to the same label either way.
	JB   three_bytes_repeat_emit_encodeSnappyBlockAsm64K

three_bytes_repeat_emit_encodeSnappyBlockAsm64K:
	// Header: tag 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm64K

two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
	// Header: tag 0xf0 followed by length-1 as one byte. Lengths with
	// length-1 < 64 use the short copy, the rest use the long copy.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_repeat_emit_encodeSnappyBlockAsm64K
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm64K

one_byte_repeat_emit_encodeSnappyBlockAsm64K:
	// Header: single byte (length-1) << 2.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeSnappyBlockAsm64K:
	// BX = dst pointer after the literal payload.
	LEAQ (AX)(DI*1), BX

	// genMemMoveShort
	// Dispatch on length DI: <= 8, <= 16, <= 32, else up to 64 bytes.
	CMPQ DI, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
	CMPQ DI, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
	CMPQ DI, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
	// Always moves a full 8 bytes even when DI < 8.
	// NOTE(review): presumably safe because of the src/dst margins set up in
	// the prologue (8(SP) and 0(SP)) - confirm.
	MOVQ (R8), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
	// First and last 8 bytes; the two moves overlap when DI < 16.
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (AX)
	MOVQ R8, -8(AX)(DI*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
	// First and last 16 bytes; the two moves overlap when DI < 32.
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DI*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
	// First 32 and last 32 bytes; the halves overlap when DI < 64.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
	MOVQ BX, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K

memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
	// BX = dst pointer after the literal payload.
	LEAQ (AX)(DI*1), BX

	// genMemMoveLong
	// Keep the first and last 32 bytes in X0-X3; they are stored last with
	// unaligned moves, while the middle is copied with aligned stores.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	// R10 = DI / 32. R11 = 64 - (AX & 31), so AX + R11 - 32 is 32-byte
	// aligned for the MOVOA stores below.
	MOVQ  DI, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	// DECQ leaves CF untouched, so JA tests CF from the SUBQ above together
	// with ZF from the DECQ.
	DECQ  R10
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(R8)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R11, then advance R11 by 32; repeat
	// while DI >= R11.
	MOVOU -32(R8)(R11*1), X4
	MOVOU -16(R8)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  DI, R11
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	// Store the saved head and tail, then advance AX past the payload.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)
	MOVQ  BX, AX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
	// Skip the 4 bytes already compared at CX+1, then extend forwards.
	// BX = earlier occurrence (CX - distance), DI = bytes left in src,
	// R8/BX become pointers to the two positions being compared.
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R10 accumulates the number of equal bytes; DI counts bytes remaining.
	XORL R10, R10

matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K:
	// Compare 16 bytes per iteration as two 8-byte XORs.
	CMPL DI, $0x10
	JB   matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K
	XORQ 8(BX)(R10*1), R11
	JNZ  matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K

matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K:
	// Mismatch in the second 8 bytes: index of the lowest set bit / 8 is the
	// number of equal bytes in that word; add 8 for the first word.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm64K

matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K:
	// Fewer than 16 bytes left: try one 8-byte compare.
	CMPL DI, $0x08
	JB   matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
	MOVQ (R8)(R10*1), R9
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K

matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K:
	// Mismatch within 8 bytes: lowest set bit / 8 = number of equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm64K

matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K:
	// Try 4 bytes; on mismatch or too few bytes fall to the 2/1 byte checks.
	CMPL DI, $0x04
	JB   matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
	MOVL (R8)(R10*1), R9
	CMPL (BX)(R10*1), R9
	JNE  matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K:
	// Exactly 1 byte left -> single byte check; 0 left -> done; otherwise
	// compare 2 bytes and finish with a single byte check if any remain.
	CMPL DI, $0x01
	JE   matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
	JB   repeat_extend_forward_end_encodeSnappyBlockAsm64K
	MOVW (R8)(R10*1), R9
	CMPW (BX)(R10*1), R9
	JNE  matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   repeat_extend_forward_end_encodeSnappyBlockAsm64K

matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K:
	MOVB (R8)(R10*1), R9
	CMPB (BX)(R10*1), R9
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm64K
	LEAL 1(R10), R10

repeat_extend_forward_end_encodeSnappyBlockAsm64K:
	// CX = end of match; BX = CX - match start (SI) = total match length;
	// SI = match distance for the copy element.
	ADDL R10, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI

	// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
	// While more than 64 bytes remain, emit tag 0xee plus the 16-bit
	// distance and reduce the remaining length by 60.
	CMPL BX, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
	// DI = length << 2. Lengths >= 12 or distances >= 2048 take the 3-byte
	// form; otherwise a 2-byte element is written:
	//   byte 0 = (length << 2) - 15, OR-ed with (distance >> 8) << 5
	//   byte 1 = low byte of distance
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
	CMPL SI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm64K

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
	// 3-byte element: byte 0 = (length << 2) - 2, then the 16-bit distance.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm64K:
	// Everything up to CX is now emitted.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm64K

no_repeat_found_encodeSnappyBlockAsm64K:
	// Test the table candidates in turn against 4 source bytes:
	// BX against the bytes at CX, DI against CX+1, then the entry for the
	// CX+2 hash (R9) against CX+2. The CX+2 entry is replaced with CX+2.
	CMPL (DX)(BX*1), SI
	JEQ  candidate_match_encodeSnappyBlockAsm64K
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ  candidate2_match_encodeSnappyBlockAsm64K
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ  candidate3_match_encodeSnappyBlockAsm64K
	// No candidate matched: resume at the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm64K

candidate3_match_encodeSnappyBlockAsm64K:
	// Match is at CX+2; BX already holds its candidate.
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm64K

candidate2_match_encodeSnappyBlockAsm64K:
	// Match is at CX+1 with candidate DI; still record CX+2 in the table.
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeSnappyBlockAsm64K:
	// CX = match position, BX = candidate position. SI = first unemitted
	// offset. A candidate at offset 0 cannot be extended backwards.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeSnappyBlockAsm64K

match_extend_back_loop_encodeSnappyBlockAsm64K:
	// Step both positions back while CX stays above the first unemitted
	// offset, the preceding bytes agree and BX has not reached 0.
	CMPL CX, SI
	JBE  match_extend_back_end_encodeSnappyBlockAsm64K
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm64K
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeSnappyBlockAsm64K
	JMP  match_extend_back_loop_encodeSnappyBlockAsm64K

match_extend_back_end_encodeSnappyBlockAsm64K:
	// Bound check: AX + pending literal length + 3 must stay below the dst
	// limit, otherwise return 0.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm64K:
	// Emit src[12(SP):CX] as a literal (nothing to do if the range is empty).
	// R8 = literal length, SI = literal source pointer, DI = length - 1.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm64K
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	// Choose the header size from length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes.
	CMPL DI, $0x3c
	JB   one_byte_match_emit_encodeSnappyBlockAsm64K
	CMPL DI, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBlockAsm64K
	// Flags are unchanged since the CMPL above, so this JB cannot be taken;
	// execution falls through to the same label either way.
	JB   three_bytes_match_emit_encodeSnappyBlockAsm64K

three_bytes_match_emit_encodeSnappyBlockAsm64K:
	// Header: tag 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm64K

two_bytes_match_emit_encodeSnappyBlockAsm64K:
	// Header: tag 0xf0 followed by length-1 as one byte. Lengths with
	// length-1 < 64 use the short copy, the rest use the long copy.
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	CMPL DI, $0x40
	JB   memmove_match_emit_encodeSnappyBlockAsm64K
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm64K

one_byte_match_emit_encodeSnappyBlockAsm64K:
	// Header: single byte (length-1) << 2.
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBlockAsm64K:
	// DI = dst pointer after the literal payload.
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	// Dispatch on length R8: <= 8, <= 16, <= 32, else up to 64 bytes.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
	// Always moves a full 8 bytes even when R8 < 8.
	// NOTE(review): presumably safe because of the src/dst margins set up in
	// the prologue (8(SP) and 0(SP)) - confirm.
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
	// First and last 8 bytes; the two moves overlap when R8 < 16.
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
	// First and last 16 bytes; the two moves overlap when R8 < 32.
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
	// First 32 and last 32 bytes; the halves overlap when R8 < 64.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
	MOVQ DI, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm64K

memmove_long_match_emit_encodeSnappyBlockAsm64K:
	// DI = dst pointer after the literal payload.
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	// Keep the first and last 32 bytes in X0-X3; they are stored last with
	// unaligned moves, while the middle is copied with aligned stores.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	// R10 = R8 / 32. R11 = 64 - (AX & 31), so AX + R11 - 32 is 32-byte
	// aligned for the MOVOA stores below.
	MOVQ  R8, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	// DECQ leaves CF untouched, so JA tests CF from the SUBQ above together
	// with ZF from the DECQ.
	DECQ  R10
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(SI)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R11, then advance R11 by 32; repeat
	// while R8 >= R11.
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  R8, R11
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	// Store the saved head and tail, then advance AX past the payload.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  DI, AX

emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
match_nolit_loop_encodeSnappyBlockAsm64K:
	// CX = match position, BX = candidate position. Record the distance
	// CX - BX at 16(SP), skip the 4 bytes already known to be equal, and
	// measure how much further the match runs. SI = bytes left in src.
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 accumulates the number of equal bytes; SI counts bytes remaining.
	XORL R9, R9

matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K:
	// Compare 16 bytes per iteration as two 8-byte XORs.
	CMPL SI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBlockAsm64K
	MOVQ (DI)(R9*1), R8
	MOVQ 8(DI)(R9*1), R10
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K
	XORQ 8(BX)(R9*1), R10
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K
	LEAL -16(SI), SI
	LEAL 16(R9), R9
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K

matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K:
	// Mismatch in the second 8 bytes: index of the lowest set bit / 8 is the
	// number of equal bytes in that word; add 8 for the first word.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL 8(R9)(R10*1), R9
	JMP  match_nolit_end_encodeSnappyBlockAsm64K

matchlen_match8_match_nolit_encodeSnappyBlockAsm64K:
	// Fewer than 16 bytes left: try one 8-byte compare.
	CMPL SI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	JMP  matchlen_match4_match_nolit_encodeSnappyBlockAsm64K

matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K:
	// Mismatch within 8 bytes: lowest set bit / 8 = number of equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP  match_nolit_end_encodeSnappyBlockAsm64K

matchlen_match4_match_nolit_encodeSnappyBlockAsm64K:
	// Try 4 bytes; on mismatch or too few bytes fall to the 2/1 byte checks.
	CMPL SI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE  matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
	LEAL -4(SI), SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeSnappyBlockAsm64K:
	// Exactly 1 byte left -> single byte check; 0 left -> done; otherwise
	// compare 2 bytes and finish with a single byte check if any remain.
	CMPL SI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
	JB   match_nolit_end_encodeSnappyBlockAsm64K
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE  matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
	LEAL 2(R9), R9
	SUBL $0x02, SI
	JZ   match_nolit_end_encodeSnappyBlockAsm64K

matchlen_match1_match_nolit_encodeSnappyBlockAsm64K:
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE  match_nolit_end_encodeSnappyBlockAsm64K
	LEAL 1(R9), R9

match_nolit_end_encodeSnappyBlockAsm64K:
	// CX = end of match, BX = match distance, R9 = total match length
	// (extension plus the initial 4 bytes). Everything up to CX counts as
	// emitted from here on.
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
	// While more than 64 bytes remain, emit tag 0xee plus the 16-bit
	// distance and reduce the remaining length by 60.
	CMPL R9, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm64K

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
	// SI = length << 2. Lengths >= 12 or distances >= 2048 take the 3-byte
	// form; otherwise a 2-byte element is written:
	//   byte 0 = (length << 2) - 15, OR-ed with (distance >> 8) << 5
	//   byte 1 = low byte of distance
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
	CMPL BX, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm64K

emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
	// 3-byte element: byte 0 = (length << 2) - 2, then the 16-bit distance.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
	// Past the search limit -> flush the tail. Otherwise load 8 bytes
	// starting at CX-2 and return 0 if AX has reached the dst limit.
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeSnappyBlockAsm64K
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm64K:
	// DI = hash at CX-2, BX = hash at CX (SI is shifted so its low bytes are
	// the bytes at CX). Store CX-2 and CX in the table and take the previous
	// entry for the CX hash as a new candidate.
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  SI, DI
	SHRQ  $0x10, SI
	MOVQ  SI, BX
	SHLQ  $0x10, DI
	IMULQ R8, DI
	SHRQ  $0x32, DI
	SHLQ  $0x10, BX
	IMULQ R8, BX
	SHRQ  $0x32, BX
	LEAL  -2(CX), R8
	LEAQ  24(SP)(BX*4), R9
	MOVL  (R9), BX
	MOVL  R8, 24(SP)(DI*4)
	MOVL  CX, (R9)
	// If the candidate's 4 bytes equal those at CX, encode another match
	// straight away with no literal in between; otherwise search from CX+1.
	CMPL  (DX)(BX*1), SI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm64K
	INCL  CX
	JMP   search_loop_encodeSnappyBlockAsm64K

emit_remainder_encodeSnappyBlockAsm64K:
	// Bound check for the tail: AX + (src_len - 12(SP)) + 3 must stay below
	// the dst limit, otherwise return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm64K:
	// Emit src[12(SP):src_len] as the final literal, if non-empty.
	// CX = literal source pointer, SI = literal length. DX no longer holds
	// the src base from here on: it becomes length - 1.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Choose the header size from length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBlockAsm64K
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBlockAsm64K
	// Flags are unchanged since the CMPL above, so this JB cannot be taken;
	// execution falls through to the same label either way.
	JB   three_bytes_emit_remainder_encodeSnappyBlockAsm64K

three_bytes_emit_remainder_encodeSnappyBlockAsm64K:
	// Header: tag 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm64K

two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
	// Header: tag 0xf0 followed by length-1 as one byte. Lengths with
	// length-1 < 64 use the short copy, the rest use the long copy.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBlockAsm64K
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm64K

one_byte_emit_remainder_encodeSnappyBlockAsm64K:
	// Header: single byte (length-1) << 2.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBlockAsm64K:
	// DX = dst pointer after the literal payload, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the literal copies in the search loop, this variant has
	// dedicated cases for 1-2, 3 and 4-7 bytes, so it moves exactly BX bytes.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2:
	// First and last byte (the same byte when BX == 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3:
	// Two bytes plus the third byte.
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7:
	// First and last 4 bytes, overlapping.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
	// First and last 8 bytes; the two moves overlap when BX < 16.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
	// First and last 16 bytes; the two moves overlap when BX < 32.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
	// First 32 and last 32 bytes; the halves overlap when BX < 64.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K

memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
	// DX = dst pointer after the literal payload, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Keep the first and last 32 bytes in X0-X3; they are stored last with
	// unaligned moves, while the middle is copied with aligned stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = BX / 32. R8 = 64 - (AX & 31), so AX + R8 - 32 is 32-byte aligned
	// for the MOVOA stores below.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// DECQ leaves CF untouched, so JA tests CF from the SUBQ above together
	// with ZF from the DECQ.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R8, then advance R8 by 32; repeat
	// while BX >= R8.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	// Store the saved head and tail, then advance AX past the payload.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Compresses src into dst as a sequence of literal runs and back-reference
// copies, using a hash table of 32-bit source positions kept on the stack.
// Returns the number of bytes written to dst, or 0 as soon as the output
// would reach the limit pointer stored at 0(SP).
// NOTE(review): the tag bytes written below look like the Snappy block format
// implied by the function name; confirm against the Go reference encoder.
//
// Stack frame layout:
//   0(SP)   8 bytes      dst limit = dst_base + (src_len - 9 - src_len>>5)
//   8(SP)   4 bytes      src_len - 8; searching stops once a position reaches it
//   12(SP)  4 bytes      nextEmit: src index of the first byte not yet emitted
//   16(SP)  4 bytes      repeat: offset of the most recent match (starts at 1)
//   20(SP)  4 bytes      position to resume searching from after a miss
//   24(SP)  16384 bytes  hash table, 4096 entries of 4 bytes
//
// Long-lived registers: AX = dst write pointer, CX = current src index,
// DX = src base pointer. CX and DX are reused as scratch once
// emit_remainder_ok starts emitting the final literals.
TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56
	MOVQ dst_base+0(FP), AX

	// Clear the hash table: 0x80 iterations of 128 bytes = 16384 bytes.
	MOVQ $0x00000080, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm12B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm12B

	// nextEmit = 0.
	MOVL  $0x00000000, 12(SP)

	// 8(SP) = src_len - 8 (search limit).
	// 0(SP) = dst_base + (src_len - 9 - src_len>>5) (output limit; the
	// subtraction is done in 32 bits).
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)

	// Start at src index 1 with repeat offset 1.
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm12B:
	// Next position to try on a miss: s + 4 + ((s - nextEmit) >> 5).
	// Stop searching once that position reaches the search limit.
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x05, BX
	LEAL  4(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeSnappyBlockAsm12B

	// SI = 8 bytes at src[s]; remember the next search position in 20(SP).
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)

	// Hash: keep the low 5 bytes (shift left by 24), multiply by the
	// constant, take the top 12 bits. R9 = hash at s, R10 = hash at s+1.
	MOVQ  $0x000000cf1bbcdcbb, R8
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHRQ  $0x08, R10
	SHLQ  $0x18, R9
	IMULQ R8, R9
	SHRQ  $0x34, R9
	SHLQ  $0x18, R10
	IMULQ R8, R10
	SHRQ  $0x34, R10

	// BX, DI = previous table entries (candidates) for s and s+1;
	// then record s and s+1 in the table.
	MOVL  24(SP)(R9*4), BX
	MOVL  24(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	LEAL  1(CX), R9
	MOVL  R9, 24(SP)(R10*4)

	// R9 = hash at s+2 (table lookup is deferred to no_repeat_found).
	MOVQ  SI, R9
	SHRQ  $0x10, R9
	SHLQ  $0x18, R9
	IMULQ R8, R9
	SHRQ  $0x34, R9

	// Repeat check: compare the 4 bytes at src[s+1-repeat] with the
	// 4 bytes at src[s+1].
	MOVL  CX, R8
	SUBL  16(SP), R8
	MOVL  1(DX)(R8*1), R10
	MOVQ  SI, R8
	SHRQ  $0x08, R8
	CMPL  R8, R10
	JNE   no_repeat_found_encodeSnappyBlockAsm12B

	// Repeat found at s+1. SI = match start (s+1), BX = nextEmit,
	// DI = candidate position (s+1-repeat). Skip the backward extension
	// when the candidate position is 0.
	LEAL  1(CX), SI
	MOVL  12(SP), BX
	MOVL  SI, DI
	SUBL  16(SP), DI
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm12B

repeat_extend_back_loop_encodeSnappyBlockAsm12B:
	// Move the match start backwards while it stays above nextEmit, the
	// preceding bytes are equal, and the candidate has not reached 0.
	CMPL SI, BX
	JBE  repeat_extend_back_end_encodeSnappyBlockAsm12B
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm12B
	LEAL -1(SI), SI
	DECL DI
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm12B

repeat_extend_back_end_encodeSnappyBlockAsm12B:
	// Output bound check: AX + 3 + (matchStart - nextEmit) must stay
	// below the dst limit, otherwise return 0.
	MOVL SI, BX
	SUBL 12(SP), BX
	LEAQ 3(AX)(BX*1), BX
	CMPQ BX, (SP)
	JB   repeat_dst_size_check_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

repeat_dst_size_check_encodeSnappyBlockAsm12B:
	// Emit src[nextEmit:matchStart] as a literal run (nothing to do when
	// the two are equal). DI = length, R8 = source pointer, BX = length-1,
	// and nextEmit is advanced to matchStart.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
	MOVL SI, DI
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R8
	SUBL BX, DI
	LEAL -1(DI), BX

	// Pick the literal header size from length-1: < 60 one byte,
	// < 256 two bytes, otherwise three bytes.
	// NOTE: the second JB below tests the same flags as the first, so it
	// is never taken; execution falls through into the three-byte case.
	CMPL BX, $0x3c
	JB   one_byte_repeat_emit_encodeSnappyBlockAsm12B
	CMPL BX, $0x00000100
	JB   two_bytes_repeat_emit_encodeSnappyBlockAsm12B
	JB   three_bytes_repeat_emit_encodeSnappyBlockAsm12B

three_bytes_repeat_emit_encodeSnappyBlockAsm12B:
	// Header: 0xf4 followed by length-1 as 16 bits.
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm12B

two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
	// Header: 0xf0 followed by length-1 as 8 bits. Lengths up to 64 use
	// the short copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_repeat_emit_encodeSnappyBlockAsm12B
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm12B

one_byte_repeat_emit_encodeSnappyBlockAsm12B:
	// Header: single byte (length-1) << 2.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeSnappyBlockAsm12B:
	// BX = dst position after the literal bytes.
	LEAQ (AX)(DI*1), BX

	// genMemMoveShort
	// Copy DI (<= 64) bytes from R8 to AX using a head move and an
	// overlapping tail move chosen by size class.
	CMPQ DI, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
	CMPQ DI, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
	CMPQ DI, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
	// Always moves a full 8 bytes, even when DI < 8; AX is set to BX
	// afterwards, so the surplus lies beyond the new write position.
	MOVQ (R8), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (AX)
	MOVQ R8, -8(AX)(DI*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DI*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
	MOVQ BX, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B

memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
	// BX = dst position after the literal bytes.
	LEAQ (AX)(DI*1), BX

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 up front and
	// stored last. The middle is copied in 32-byte steps; R11 starts at
	// 64 - (AX & 31), so the MOVOA stores at AX+R11-32 are 32-byte aligned.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVQ  DI, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	DECQ  R10
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R8)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while R11 <= DI, then store the saved
	// head and tail and advance AX past the literal.
	MOVOU -32(R8)(R11*1), X4
	MOVOU -16(R8)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  DI, R11
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)
	MOVQ  BX, AX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
	// Step over the 4 bytes already known to match (s+1 .. s+4) and
	// extend forward. DI = bytes left in src, R8 = &src[s],
	// BX = &src[s-repeat].
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R10 = number of equal bytes at R8 and BX, limited by DI.
	XORL R10, R10

matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B:
	// Compare 16 bytes per iteration as two 8-byte XORs.
	CMPL DI, $0x10
	JB   matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
	XORQ 8(BX)(R10*1), R11
	JNZ  matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B

matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B:
	// Mismatch in the second 8 bytes: index of the lowest differing bit,
	// divided by 8, is the count of equal bytes; add the 8 from the
	// first half.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm12B

matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B:
	CMPL DI, $0x08
	JB   matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
	MOVQ (R8)(R10*1), R9
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B

matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B:
	// Mismatch within an 8-byte word: lowest differing bit / 8 = equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm12B

matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B:
	CMPL DI, $0x04
	JB   matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
	MOVL (R8)(R10*1), R9
	CMPL (BX)(R10*1), R9
	JNE  matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B:
	// One CMPL feeds both branches: exactly 1 byte left -> byte compare,
	// 0 bytes left -> done.
	CMPL DI, $0x01
	JE   matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
	JB   repeat_extend_forward_end_encodeSnappyBlockAsm12B
	MOVW (R8)(R10*1), R9
	CMPW (BX)(R10*1), R9
	JNE  matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   repeat_extend_forward_end_encodeSnappyBlockAsm12B

matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B:
	MOVB (R8)(R10*1), R9
	CMPB (BX)(R10*1), R9
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm12B
	LEAL 1(R10), R10

repeat_extend_forward_end_encodeSnappyBlockAsm12B:
	// s += extension. BX = total match length (s - matchStart),
	// SI = repeat offset.
	ADDL R10, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI

	// emitCopy
	// While more than 64 bytes remain, emit 0xee plus the 16-bit offset
	// and subtract 60 from the remaining length.
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
	CMPL BX, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
	// DI = length << 2. Length >= 12 or offset >= 2048 takes the
	// three-byte form. Otherwise two bytes are written:
	// tag = (length<<2) - 15 | (offset>>8)<<5, then the low offset byte.
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
	CMPL SI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm12B

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
	// Three-byte form: tag = (length<<2) - 2, then the 16-bit offset.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm12B:
	// nextEmit = s; resume searching.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm12B

no_repeat_found_encodeSnappyBlockAsm12B:
	// Try the hashed candidates in turn. SI still holds the bytes at s.
	// 1) candidate for s (BX) against the 4 bytes at s.
	CMPL (DX)(BX*1), SI
	JEQ  candidate_match_encodeSnappyBlockAsm12B

	// 2) candidate for s+1 (DI) against the 4 bytes at s+1. BX is
	// reloaded with the table entry for the s+2 hash; R8 = s+2.
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ  candidate2_match_encodeSnappyBlockAsm12B

	// 3) record s+2 in the table, then test its previous entry against
	// the 4 bytes at s+2.
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ  candidate3_match_encodeSnappyBlockAsm12B

	// No match: continue from the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm12B

candidate3_match_encodeSnappyBlockAsm12B:
	// Match is at s+2.
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm12B

candidate2_match_encodeSnappyBlockAsm12B:
	// Match is at s+1: still record s+2 in the table, then use DI as the
	// candidate.
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeSnappyBlockAsm12B:
	// CX = match position, BX = candidate position. Extend backwards
	// unless the candidate is already at 0.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeSnappyBlockAsm12B

match_extend_back_loop_encodeSnappyBlockAsm12B:
	// Step both positions back while s > nextEmit, the preceding bytes
	// are equal, and the candidate has not reached 0.
	CMPL CX, SI
	JBE  match_extend_back_end_encodeSnappyBlockAsm12B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm12B
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeSnappyBlockAsm12B
	JMP  match_extend_back_loop_encodeSnappyBlockAsm12B

match_extend_back_end_encodeSnappyBlockAsm12B:
	// Output bound check: AX + 3 + (s - nextEmit) must stay below the
	// dst limit, otherwise return 0.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm12B:
	// Emit src[nextEmit:s] as a literal run (nothing to do when equal).
	// R8 = length, SI = source pointer, DI = length-1; nextEmit = s.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm12B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI

	// Header size from length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes.
	// NOTE: the second JB below tests the same flags as the first, so it
	// is never taken; execution falls through into the three-byte case.
	CMPL DI, $0x3c
	JB   one_byte_match_emit_encodeSnappyBlockAsm12B
	CMPL DI, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBlockAsm12B
	JB   three_bytes_match_emit_encodeSnappyBlockAsm12B

three_bytes_match_emit_encodeSnappyBlockAsm12B:
	// Header: 0xf4 followed by length-1 as 16 bits.
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm12B

two_bytes_match_emit_encodeSnappyBlockAsm12B:
	// Header: 0xf0 followed by length-1 as 8 bits. Lengths up to 64 use
	// the short copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	CMPL DI, $0x40
	JB   memmove_match_emit_encodeSnappyBlockAsm12B
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm12B

one_byte_match_emit_encodeSnappyBlockAsm12B:
	// Header: single byte (length-1) << 2.
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBlockAsm12B:
	// DI = dst position after the literal bytes.
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	// Copy R8 (<= 64) bytes from SI to AX using a head move and an
	// overlapping tail move chosen by size class.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
	// Always moves a full 8 bytes, even when R8 < 8; AX is set to DI
	// afterwards, so the surplus lies beyond the new write position.
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
	MOVQ DI, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm12B

memmove_long_match_emit_encodeSnappyBlockAsm12B:
	// DI = dst position after the literal bytes.
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 up front and
	// stored last. The middle is copied in 32-byte steps; R11 starts at
	// 64 - (AX & 31), so the MOVOA stores at AX+R11-32 are 32-byte aligned.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVQ  R8, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	DECQ  R10
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(SI)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while R11 <= R8, then store the saved
	// head and tail and advance AX past the literal.
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  R8, R11
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  DI, AX

emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
match_nolit_loop_encodeSnappyBlockAsm12B:
	// CX = match position, BX = candidate position. Store the offset
	// (s - candidate) as the new repeat value, skip the 4 bytes already
	// compared, and extend forward. SI = bytes left in src,
	// DI = &src[s], BX = &src[candidate].
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 = number of equal bytes at DI and BX, limited by SI.
	XORL R9, R9

matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B:
	// Compare 16 bytes per iteration as two 8-byte XORs.
	CMPL SI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBlockAsm12B
	MOVQ (DI)(R9*1), R8
	MOVQ 8(DI)(R9*1), R10
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
	XORQ 8(BX)(R9*1), R10
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B
	LEAL -16(SI), SI
	LEAL 16(R9), R9
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B

matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B:
	// Mismatch in the second 8 bytes: lowest differing bit / 8 = equal
	// bytes; add the 8 from the first half.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL 8(R9)(R10*1), R9
	JMP  match_nolit_end_encodeSnappyBlockAsm12B

matchlen_match8_match_nolit_encodeSnappyBlockAsm12B:
	CMPL SI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	JMP  matchlen_match4_match_nolit_encodeSnappyBlockAsm12B

matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B:
	// Mismatch within an 8-byte word: lowest differing bit / 8 = equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP  match_nolit_end_encodeSnappyBlockAsm12B

matchlen_match4_match_nolit_encodeSnappyBlockAsm12B:
	CMPL SI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE  matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
	LEAL -4(SI), SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeSnappyBlockAsm12B:
	// One CMPL feeds both branches: exactly 1 byte left -> byte compare,
	// 0 bytes left -> done.
	CMPL SI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
	JB   match_nolit_end_encodeSnappyBlockAsm12B
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE  matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
	LEAL 2(R9), R9
	SUBL $0x02, SI
	JZ   match_nolit_end_encodeSnappyBlockAsm12B

matchlen_match1_match_nolit_encodeSnappyBlockAsm12B:
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE  match_nolit_end_encodeSnappyBlockAsm12B
	LEAL 1(R9), R9

match_nolit_end_encodeSnappyBlockAsm12B:
	// s += extension; BX = offset; R9 = total match length (extension
	// plus the 4 bytes skipped above); nextEmit = s.
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	// While more than 64 bytes remain, emit 0xee plus the 16-bit offset
	// and subtract 60 from the remaining length.
two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
	CMPL R9, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm12B

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
	// SI = length << 2. Length >= 12 or offset >= 2048 takes the
	// three-byte form. Otherwise two bytes are written:
	// tag = (length<<2) - 15 | (offset>>8)<<5, then the low offset byte.
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
	CMPL BX, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm12B

emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
	// Three-byte form: tag = (length<<2) - 2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
	// Past the search limit: flush the remaining literals. Otherwise
	// load the 8 bytes at src[s-2] and check the dst limit (return 0 if
	// AX has reached it).
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeSnappyBlockAsm12B
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm12B:
	// Hash positions s-2 (DI) and s (BX; SI is shifted so it holds the
	// bytes at s). Fetch the previous table entry for s as the next
	// candidate, then record s-2 and s in the table.
	MOVQ  $0x000000cf1bbcdcbb, R8
	MOVQ  SI, DI
	SHRQ  $0x10, SI
	MOVQ  SI, BX
	SHLQ  $0x18, DI
	IMULQ R8, DI
	SHRQ  $0x34, DI
	SHLQ  $0x18, BX
	IMULQ R8, BX
	SHRQ  $0x34, BX
	LEAL  -2(CX), R8
	LEAQ  24(SP)(BX*4), R9
	MOVL  (R9), BX
	MOVL  R8, 24(SP)(DI*4)
	MOVL  CX, (R9)

	// If the candidate's 4 bytes equal the 4 bytes at s, emit another
	// copy straight away with no literals; otherwise search from s+1.
	CMPL  (DX)(BX*1), SI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm12B
	INCL  CX
	JMP   search_loop_encodeSnappyBlockAsm12B

emit_remainder_encodeSnappyBlockAsm12B:
	// Output bound check for the trailing literals:
	// AX + 3 + (src_len - nextEmit) must stay below the dst limit,
	// otherwise return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm12B:
	// Emit src[nextEmit:src_len] as a final literal run (nothing to do
	// when equal). From here CX = source pointer, SI = length,
	// DX = length-1; the src index and src base are no longer needed.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX

	// Header size from length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes.
	// NOTE: the second JB below tests the same flags as the first, so it
	// is never taken; execution falls through into the three-byte case.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBlockAsm12B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBlockAsm12B
	JB   three_bytes_emit_remainder_encodeSnappyBlockAsm12B

three_bytes_emit_remainder_encodeSnappyBlockAsm12B:
	// Header: 0xf4 followed by length-1 as 16 bits.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm12B

two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
	// Header: 0xf0 followed by length-1 as 8 bits. Lengths up to 64 use
	// the short copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm12B

one_byte_emit_remainder_encodeSnappyBlockAsm12B:
	// Header: single byte (length-1) << 2.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBlockAsm12B:
	// DX = dst position after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copy BX (<= 64) bytes from CX to AX. Unlike the in-loop copies,
	// this variant has dedicated 1-2, 3 and 4-7 byte cases, so every case
	// reads and writes only within the BX-byte span.
	// NOTE(review): presumably needed because this run ends at the end of
	// src; confirm against the generator.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2:
	// First byte and last byte (the same byte when BX == 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B

memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
	// DX = dst position after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 up front and
	// stored last. The middle is copied in 32-byte steps; R8 starts at
	// 64 - (AX & 31), so the MOVOA stores at AX+R8-32 are 32-byte aligned.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while R8 <= BX, then store the saved
	// head and tail and advance AX past the literal.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000020, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm10B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm10B
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm10B:
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x05, BX
	LEAL  4(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeSnappyBlockAsm10B
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	MOVQ  $0x9e3779b1, R8
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHRQ  $0x08, R10
	SHLQ  $0x20, R9
	IMULQ R8, R9
	SHRQ  $0x36, R9
	SHLQ  $0x20, R10
	IMULQ R8, R10
	SHRQ  $0x36, R10
	MOVL  24(SP)(R9*4), BX
	MOVL  24(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	LEAL  1(CX), R9
	MOVL  R9, 24(SP)(R10*4)
	MOVQ  SI, R9
	SHRQ  $0x10, R9
	SHLQ  $0x20, R9
	IMULQ R8, R9
	SHRQ  $0x36, R9
	MOVL  CX, R8
	SUBL  16(SP), R8
	MOVL  1(DX)(R8*1), R10
	MOVQ  SI, R8
	SHRQ  $0x08, R8
	CMPL  R8, R10
	JNE   no_repeat_found_encodeSnappyBlockAsm10B
	LEAL  1(CX), SI
	MOVL  12(SP), BX
	MOVL  SI, DI
	SUBL  16(SP), DI
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm10B

repeat_extend_back_loop_encodeSnappyBlockAsm10B:
	CMPL SI, BX
	JBE  repeat_extend_back_end_encodeSnappyBlockAsm10B
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm10B
	LEAL -1(SI), SI
	DECL DI
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm10B

repeat_extend_back_end_encodeSnappyBlockAsm10B:
	MOVL SI, BX
	SUBL 12(SP), BX
	LEAQ 3(AX)(BX*1), BX
	CMPQ BX, (SP)
	JB   repeat_dst_size_check_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

repeat_dst_size_check_encodeSnappyBlockAsm10B:
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
	MOVL SI, DI
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R8
	SUBL BX, DI
	LEAL -1(DI), BX
	CMPL BX, $0x3c
	JB   one_byte_repeat_emit_encodeSnappyBlockAsm10B
	CMPL BX, $0x00000100
	JB   two_bytes_repeat_emit_encodeSnappyBlockAsm10B
	JB   three_bytes_repeat_emit_encodeSnappyBlockAsm10B

three_bytes_repeat_emit_encodeSnappyBlockAsm10B:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm10B

two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_repeat_emit_encodeSnappyBlockAsm10B
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm10B

one_byte_repeat_emit_encodeSnappyBlockAsm10B:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeSnappyBlockAsm10B:
	LEAQ (AX)(DI*1), BX

	// genMemMoveShort
	CMPQ DI, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
	CMPQ DI, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ DI, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
	MOVQ (R8), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (AX)
	MOVQ R8, -8(AX)(DI*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DI*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
	MOVQ BX, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B

memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
	LEAQ (AX)(DI*1), BX

	// genMemMoveLong
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVQ  DI, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	DECQ  R10
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R8)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R8)(R11*1), X4
	MOVOU -16(R8)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  DI, R11
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)
	MOVQ  BX, AX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
	// Prepare the forward extension of a repeat match.
	// CX advances by 5, BX = CX - 16(SP) (16(SP) holds the match offset),
	// DI = src_len - CX is the number of bytes still available to compare.
	// R8 = DX+CX and BX = DX+BX are the two pointers compared by matchLen.
	// NOTE(review): DX is presumably the src base pointer loaded before this
	// view — confirm.
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R10 accumulates the number of equal bytes found so far.
	XORL R10, R10

matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B:
	// With at least 16 bytes left, XOR two 8-byte words from each side.
	// A non-zero XOR result identifies the word holding the first mismatch.
	CMPL DI, $0x10
	JB   matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B
	XORQ 8(BX)(R10*1), R11
	JNZ  matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B

matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B:
	// Mismatch in the second word: index of the lowest set bit, divided by 8,
	// is the count of equal low-order bytes; add 8 for the first word.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm10B

matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B:
	// Fewer than 16 bytes left: try one more 8-byte word.
	CMPL DI, $0x08
	JB   matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
	MOVQ (R8)(R10*1), R9
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B

matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B:
	// Mismatch inside the word in R9: lowest set bit / 8 = equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm10B

matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B:
	// 4-byte step; on mismatch fall through to the 2-byte and 1-byte checks.
	CMPL DI, $0x04
	JB   matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
	MOVL (R8)(R10*1), R9
	CMPL (BX)(R10*1), R9
	JNE  matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B:
	// Exactly one byte left goes straight to the byte compare; zero left ends.
	CMPL DI, $0x01
	JE   matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
	JB   repeat_extend_forward_end_encodeSnappyBlockAsm10B
	MOVW (R8)(R10*1), R9
	CMPW (BX)(R10*1), R9
	JNE  matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   repeat_extend_forward_end_encodeSnappyBlockAsm10B

matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B:
	// Final single-byte compare.
	MOVB (R8)(R10*1), R9
	CMPB (BX)(R10*1), R9
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm10B
	LEAL 1(R10), R10

repeat_extend_forward_end_encodeSnappyBlockAsm10B:
	// CX moves past the R10 bytes matched by matchLen. BX = CX - SI becomes
	// the length to encode and SI is reloaded with the offset from 16(SP).
	// NOTE(review): SI is expected to still hold the match start position set
	// by code above this view — confirm.
	ADDL R10, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI

	// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
	// While the length exceeds 64, write a 3-byte element: tag 0xee
	// ((59<<2)|2, i.e. a length-60 copy) plus the low 16 bits of the offset,
	// then subtract 60 from the remaining length and try again.
	CMPL BX, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
	// DI = length<<2. Lengths >= 12 or offsets >= 2048 use the 3-byte form.
	// Otherwise a 2-byte element is written:
	//   tag   = ((length-4)<<2 | 1) | (offset>>8)<<5   (computed as DI-15 | ...)
	//   byte1 = low 8 bits of the offset
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
	CMPL SI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm10B

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
	// 3-byte element: tag = (length<<2)-2 = ((length-1)<<2 | 2), then the
	// low 16 bits of the offset.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm10B:
	// Everything up to CX is now encoded; record it in 12(SP) and resume searching.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm10B

no_repeat_found_encodeSnappyBlockAsm10B:
	// Checks up to three table candidates against the input at CX, CX+1 and CX+2.
	// NOTE(review): BX, DI, R9 and SI are populated by the search loop above
	// this view; presumably BX/DI are the candidates for positions CX and CX+1,
	// R9 the table index for position CX+2 and SI the 8 input bytes at CX — confirm.
	// First candidate: 4 bytes at DX+BX against the low 32 bits of SI.
	CMPL (DX)(BX*1), SI
	JEQ  candidate_match_encodeSnappyBlockAsm10B
	// Drop one input byte, fetch the third candidate from table slot R9 and
	// test the second candidate (DI).
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ  candidate2_match_encodeSnappyBlockAsm10B
	// Store CX+2 in slot R9, drop another byte and test the third candidate.
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ  candidate3_match_encodeSnappyBlockAsm10B
	// No match: continue at the next search position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm10B

candidate3_match_encodeSnappyBlockAsm10B:
	// Third candidate matched at CX+2; BX already holds that candidate.
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm10B

candidate2_match_encodeSnappyBlockAsm10B:
	// Second candidate matched at CX+1: still record CX+2 in slot R9, then
	// move the candidate from DI into BX.
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeSnappyBlockAsm10B:
	// Extend the match backwards. SI = 12(SP) is the lower bound for CX;
	// a candidate at position 0 cannot be extended.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeSnappyBlockAsm10B

match_extend_back_loop_encodeSnappyBlockAsm10B:
	// Step both positions back by one while CX > SI, the preceding bytes
	// are equal and the candidate has not reached 0.
	CMPL CX, SI
	JBE  match_extend_back_end_encodeSnappyBlockAsm10B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm10B
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeSnappyBlockAsm10B
	JMP  match_extend_back_loop_encodeSnappyBlockAsm10B

match_extend_back_end_encodeSnappyBlockAsm10B:
	// Output space check: AX + 3 + (CX - 12(SP)) must stay below the limit
	// stored at (SP); otherwise the function returns 0.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm10B:
	// Emit the literal header for the bytes between 12(SP) and CX (skipped
	// entirely when there are none).
	// After setup: SI = source pointer (DX + old 12(SP)), R8 = literal length,
	// DI = length-1, and 12(SP) has been advanced to CX.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm10B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	// length-1 < 60: one-byte header; < 256: two-byte header; else three-byte.
	CMPL DI, $0x3c
	JB   one_byte_match_emit_encodeSnappyBlockAsm10B
	CMPL DI, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBlockAsm10B
	// The flags are unchanged since the CMPL above, so this second JB can
	// never be taken; execution falls through into the three-byte case.
	JB   three_bytes_match_emit_encodeSnappyBlockAsm10B

three_bytes_match_emit_encodeSnappyBlockAsm10B:
	// Tag 0xf4 (61<<2) followed by the low 16 bits of length-1.
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm10B

two_bytes_match_emit_encodeSnappyBlockAsm10B:
	// Tag 0xf0 (60<<2) followed by length-1 in one byte. Lengths up to 64
	// use the short copy routine, longer ones the long copy routine.
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	CMPL DI, $0x40
	JB   memmove_match_emit_encodeSnappyBlockAsm10B
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm10B

one_byte_match_emit_encodeSnappyBlockAsm10B:
	// Single-byte header: (length-1)<<2.
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBlockAsm10B:
	// Copy R8 literal bytes (at most 64 on this path) from SI to AX.
	// DI = AX + R8 is the output pointer after the copy.
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	// Dispatch on length: <=8, 9..16, 17..32, 33..64.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
	// Always moves a full 8 bytes, even for shorter lengths; only DI (set
	// above) determines how far the output pointer advances.
	// NOTE(review): this over-read/over-write presumably relies on the input
	// and output margins enforced elsewhere in the function — confirm.
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
	// Two possibly overlapping 8-byte moves: first 8 and last 8 bytes.
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
	// Two possibly overlapping 16-byte moves: first 16 and last 16 bytes.
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
	// First 32 and last 32 bytes; all loads are done before any store.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
	// Advance the output pointer past the copied literals.
	MOVQ DI, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm10B

memmove_long_match_emit_encodeSnappyBlockAsm10B:
	// Copy R8 literal bytes from SI to AX for lengths above 64.
	// DI = AX + R8 is the output pointer after the copy.
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	// The first and last 32 source bytes are kept in X0..X3 and stored last,
	// so the middle can be written in 32-byte chunks with aligned stores.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	// R10 = number of 32-byte chunks, R11 = 64 - (AX & 31), so that
	// AX + R11 - 32 is a 32-byte aligned destination address.
	MOVQ  R8, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	// DECQ does not write CF (still clear from the SUBQ), so JA is taken
	// whenever R10 is non-zero after the decrement.
	DECQ  R10
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(SI)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
	// One 32-byte chunk via running pointers R9 (source) and R12 (aligned destination).
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	// Copy the chunk [R11-32, R11), then advance R11 by 32; repeat while
	// R8 >= R11.
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  R8, R11
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	// Write the saved head and tail with unaligned stores, then advance AX.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  DI, AX

emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
match_nolit_loop_encodeSnappyBlockAsm10B:
	// A match at CX against candidate BX: store the offset CX - BX in 16(SP),
	// skip 4 bytes on both sides and measure the rest of the match.
	// SI = src_len - CX is the byte budget; DI = DX+CX and BX = DX+BX are
	// the two pointers being compared.
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 accumulates the number of equal bytes found so far.
	XORL R9, R9

matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B:
	// With at least 16 bytes left, XOR two 8-byte words from each side.
	// A non-zero XOR result identifies the word holding the first mismatch.
	CMPL SI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBlockAsm10B
	MOVQ (DI)(R9*1), R8
	MOVQ 8(DI)(R9*1), R10
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B
	XORQ 8(BX)(R9*1), R10
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B
	LEAL -16(SI), SI
	LEAL 16(R9), R9
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B

matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B:
	// Mismatch in the second word: lowest set bit / 8 = equal bytes in that
	// word; add 8 for the first word.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL 8(R9)(R10*1), R9
	JMP  match_nolit_end_encodeSnappyBlockAsm10B

matchlen_match8_match_nolit_encodeSnappyBlockAsm10B:
	// Fewer than 16 bytes left: try one more 8-byte word.
	CMPL SI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	JMP  matchlen_match4_match_nolit_encodeSnappyBlockAsm10B

matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B:
	// Mismatch inside the word in R8: lowest set bit / 8 = equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP  match_nolit_end_encodeSnappyBlockAsm10B

matchlen_match4_match_nolit_encodeSnappyBlockAsm10B:
	// 4-byte step; on mismatch fall through to the 2-byte and 1-byte checks.
	CMPL SI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE  matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
	LEAL -4(SI), SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeSnappyBlockAsm10B:
	// Exactly one byte left goes straight to the byte compare; zero left ends.
	CMPL SI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
	JB   match_nolit_end_encodeSnappyBlockAsm10B
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE  matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
	LEAL 2(R9), R9
	SUBL $0x02, SI
	JZ   match_nolit_end_encodeSnappyBlockAsm10B

matchlen_match1_match_nolit_encodeSnappyBlockAsm10B:
	// Final single-byte compare.
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE  match_nolit_end_encodeSnappyBlockAsm10B
	LEAL 1(R9), R9

match_nolit_end_encodeSnappyBlockAsm10B:
	// CX moves past the extra matched bytes. BX = offset from 16(SP),
	// R9 = total match length (extension + the 4 bytes skipped earlier),
	// and 12(SP) is advanced to CX because the match covers that input.
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
	// While the length exceeds 64, write a 3-byte element: tag 0xee
	// ((59<<2)|2, i.e. a length-60 copy) plus the low 16 bits of the offset,
	// then subtract 60 from the remaining length and try again.
	CMPL R9, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm10B

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
	// SI = length<<2. Lengths >= 12 or offsets >= 2048 use the 3-byte form.
	// Otherwise a 2-byte element is written:
	//   tag   = ((length-4)<<2 | 1) | (offset>>8)<<5   (computed as SI-15 | ...)
	//   byte1 = low 8 bits of the offset
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
	CMPL BX, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm10B

emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
	// 3-byte element: tag = (length<<2)-2 = ((length-1)<<2 | 2), then the
	// low 16 bits of the offset.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
	// Stop searching once CX reaches the limit in 8(SP). Otherwise load the
	// 8 input bytes starting at CX-2 and verify the output pointer is still
	// below the limit stored at (SP); if not, return 0.
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeSnappyBlockAsm10B
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm10B:
	// Hash: ((x << 32) * 0x9e3779b1) >> 54, i.e. a 10-bit table index built
	// from the low 4 bytes of x.
	// DI = index for the bytes at CX-2, BX = index for the bytes at CX
	// (SI is shifted right by 16 bits so it starts at CX).
	MOVQ  $0x9e3779b1, R8
	MOVQ  SI, DI
	SHRQ  $0x10, SI
	MOVQ  SI, BX
	SHLQ  $0x20, DI
	IMULQ R8, DI
	SHRQ  $0x36, DI
	SHLQ  $0x20, BX
	IMULQ R8, BX
	SHRQ  $0x36, BX
	// Record CX-2 in its slot; read the previous candidate for CX into BX
	// and replace it with CX.
	LEAL  -2(CX), R8
	LEAQ  24(SP)(BX*4), R9
	MOVL  (R9), BX
	MOVL  R8, 24(SP)(DI*4)
	MOVL  CX, (R9)
	// If the candidate's 4 bytes equal the input at CX, encode another match
	// immediately with no literals in between; otherwise resume at CX+1.
	CMPL  (DX)(BX*1), SI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm10B
	INCL  CX
	JMP   search_loop_encodeSnappyBlockAsm10B

emit_remainder_encodeSnappyBlockAsm10B:
	// Emit everything from 12(SP) to the end of the input as one literal run.
	// First check output space: AX + 3 + (src_len - 12(SP)) must stay below
	// the limit stored at (SP); otherwise the function returns 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm10B:
	// Nothing to emit when 12(SP) already equals src_len.
	// After setup: CX = source pointer (DX + old 12(SP)), SI = literal length,
	// DX = length-1 (DX is overwritten here), and 12(SP) = src_len.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// length-1 < 60: one-byte header; < 256: two-byte header; else three-byte.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBlockAsm10B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBlockAsm10B
	// The flags are unchanged since the CMPL above, so this second JB can
	// never be taken; execution falls through into the three-byte case.
	JB   three_bytes_emit_remainder_encodeSnappyBlockAsm10B

three_bytes_emit_remainder_encodeSnappyBlockAsm10B:
	// Tag 0xf4 (61<<2) followed by the low 16 bits of length-1.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm10B

two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
	// Tag 0xf0 (60<<2) followed by length-1 in one byte. Lengths up to 64
	// use the short copy routine, longer ones the long copy routine.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm10B

one_byte_emit_remainder_encodeSnappyBlockAsm10B:
	// Single-byte header: (length-1)<<2.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBlockAsm10B:
	// Copy the final literals (at most 64 bytes on this path) from CX to AX.
	// DX = AX + length is the output pointer after the copy, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Every size class below moves exactly BX bytes by combining a head and
	// a (possibly overlapping) tail access.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2:
	// First byte and last byte (the same byte when BX == 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3:
	// A 2-byte move followed by a 1-byte move.
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7:
	// First 4 and last 4 bytes.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
	// First 8 and last 8 bytes.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32:
	// First 16 and last 16 bytes.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64:
	// First 32 and last 32 bytes; all loads are done before any store.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
	// Advance the output pointer past the copied literals.
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B

memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
	// Copy the final literals (more than 64 bytes) from CX to AX.
	// DX = AX + length is the output pointer after the copy, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// The first and last 32 source bytes are kept in X0..X3 and stored last,
	// so the middle can be written in 32-byte chunks with aligned stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = number of 32-byte chunks, R8 = 64 - (AX & 31), so that
	// AX + R8 - 32 is a 32-byte aligned destination address.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// DECQ does not write CF (still clear from the SUBQ), so JA is taken
	// whenever DI is non-zero after the decrement.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back:
	// One 32-byte chunk via running pointers SI (source) and R9 (aligned destination).
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	// Copy the chunk [R8-32, R8), then advance R8 by 32; repeat while
	// BX >= R8.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	// Write the saved head and tail with unaligned stores, then advance AX.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Returns the number of bytes written to dst, or 0 when an output-space
// check fails. Frame: 1048 bytes of locals, 56 bytes of arguments/result.
// Stack slots initialised by the prologue:
//   (SP)    output limit: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)   src_len - 8, the limit compared against the search position
//   12(SP)  start of the input not yet emitted (initially 0)
//   16(SP)  offset used for the repeat check (initially 1)
//   24(SP)  table of 256 32-bit positions (1024 bytes), zeroed below
// 20(SP) is written by the search loop. AX is the output pointer, DX the
// src base pointer, and CX the current input position (starting at 1).
TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000008, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm8B:
	// 8 iterations x 128 bytes clears the 1024-byte table at 24(SP).
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm8B
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm8B:
	// Next search position: BX = CX + 4 + ((CX - 12(SP)) >> 4), so the step
	// grows with the amount of input not yet emitted. Give up and emit the
	// remainder once it reaches the limit in 8(SP); otherwise save it in 20(SP).
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x04, BX
	LEAL  4(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeSnappyBlockAsm8B
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	// Hash: ((x << 32) * 0x9e3779b1) >> 56, an 8-bit table index built from
	// the low 4 bytes of x. R9 = index for the bytes at CX, R10 = index for
	// the bytes at CX+1.
	MOVQ  $0x9e3779b1, R8
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHRQ  $0x08, R10
	SHLQ  $0x20, R9
	IMULQ R8, R9
	SHRQ  $0x38, R9
	SHLQ  $0x20, R10
	IMULQ R8, R10
	SHRQ  $0x38, R10
	// Fetch both previous candidates (BX, DI), then store CX and CX+1.
	MOVL  24(SP)(R9*4), BX
	MOVL  24(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	LEAL  1(CX), R9
	MOVL  R9, 24(SP)(R10*4)
	// R9 = index for the bytes at CX+2, consumed by no_repeat_found.
	MOVQ  SI, R9
	SHRQ  $0x10, R9
	SHLQ  $0x20, R9
	IMULQ R8, R9
	SHRQ  $0x38, R9
	// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes located
	// 16(SP) (the stored offset) before them.
	MOVL  CX, R8
	SUBL  16(SP), R8
	MOVL  1(DX)(R8*1), R10
	MOVQ  SI, R8
	SHRQ  $0x08, R8
	CMPL  R8, R10
	JNE   no_repeat_found_encodeSnappyBlockAsm8B
	// Repeat found at SI = CX+1. BX = 12(SP) bounds the backward extension,
	// DI = SI - offset is the matching earlier position (no extension if 0).
	LEAL  1(CX), SI
	MOVL  12(SP), BX
	MOVL  SI, DI
	SUBL  16(SP), DI
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm8B

repeat_extend_back_loop_encodeSnappyBlockAsm8B:
	// Step SI and DI back by one while SI > BX, the preceding bytes are
	// equal and DI has not reached 0.
	CMPL SI, BX
	JBE  repeat_extend_back_end_encodeSnappyBlockAsm8B
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm8B
	LEAL -1(SI), SI
	DECL DI
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm8B

repeat_extend_back_end_encodeSnappyBlockAsm8B:
	// Output space check: AX + 3 + (SI - 12(SP)) must stay below the limit
	// stored at (SP); otherwise the function returns 0.
	MOVL SI, BX
	SUBL 12(SP), BX
	LEAQ 3(AX)(BX*1), BX
	CMPQ BX, (SP)
	JB   repeat_dst_size_check_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

repeat_dst_size_check_encodeSnappyBlockAsm8B:
	// Emit the literal header for the bytes between 12(SP) and SI (skipped
	// entirely when there are none).
	// After setup: R8 = source pointer (DX + old 12(SP)), DI = literal length,
	// BX = length-1, and 12(SP) has been advanced to SI.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
	MOVL SI, DI
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R8
	SUBL BX, DI
	LEAL -1(DI), BX
	// length-1 < 60: one-byte header; < 256: two-byte header; else three-byte.
	CMPL BX, $0x3c
	JB   one_byte_repeat_emit_encodeSnappyBlockAsm8B
	CMPL BX, $0x00000100
	JB   two_bytes_repeat_emit_encodeSnappyBlockAsm8B
	// The flags are unchanged since the CMPL above, so this second JB can
	// never be taken; execution falls through into the three-byte case.
	JB   three_bytes_repeat_emit_encodeSnappyBlockAsm8B

three_bytes_repeat_emit_encodeSnappyBlockAsm8B:
	// Tag 0xf4 (61<<2) followed by the low 16 bits of length-1.
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm8B

two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
	// Tag 0xf0 (60<<2) followed by length-1 in one byte. Lengths up to 64
	// use the short copy routine, longer ones the long copy routine.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_repeat_emit_encodeSnappyBlockAsm8B
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm8B

one_byte_repeat_emit_encodeSnappyBlockAsm8B:
	// Single-byte header: (length-1)<<2.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeSnappyBlockAsm8B:
	// Copy DI literal bytes (at most 64 on this path) from R8 to AX.
	// BX = AX + DI is the output pointer after the copy.
	LEAQ (AX)(DI*1), BX

	// genMemMoveShort
	// Dispatch on length: <=8, 9..16, 17..32, 33..64.
	CMPQ DI, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
	CMPQ DI, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ DI, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
	// Always moves a full 8 bytes, even for shorter lengths; only BX (set
	// above) determines how far the output pointer advances.
	// NOTE(review): this over-read/over-write presumably relies on the
	// src_len-8 search limit and the output margin checked earlier — confirm.
	MOVQ (R8), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
	// Two possibly overlapping 8-byte moves: first 8 and last 8 bytes.
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (AX)
	MOVQ R8, -8(AX)(DI*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
	// Two possibly overlapping 16-byte moves: first 16 and last 16 bytes.
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DI*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
	// First 32 and last 32 bytes; all loads are done before any store.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
	// Advance the output pointer past the copied literals.
	MOVQ BX, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B

memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
	// Copy DI literal bytes from R8 to AX for lengths above 64.
	// BX = AX + DI is the output pointer after the copy.
	LEAQ (AX)(DI*1), BX

	// genMemMoveLong
	// The first and last 32 source bytes are kept in X0..X3 and stored last,
	// so the middle can be written in 32-byte chunks with aligned stores.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	// R10 = number of 32-byte chunks, R11 = 64 - (AX & 31), so that
	// AX + R11 - 32 is a 32-byte aligned destination address.
	MOVQ  DI, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	// DECQ does not write CF (still clear from the SUBQ), so JA is taken
	// whenever R10 is non-zero after the decrement.
	DECQ  R10
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R8)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
	// One 32-byte chunk via running pointers R9 (source) and R12 (aligned destination).
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	// Copy the chunk [R11-32, R11), then advance R11 by 32; repeat while
	// DI >= R11.
	MOVOU -32(R8)(R11*1), X4
	MOVOU -16(R8)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  DI, R11
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	// Write the saved head and tail with unaligned stores, then advance AX.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)
	MOVQ  BX, AX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
	// Prepare the forward extension of the repeat match. The repeat check in
	// the search loop compared 4 bytes starting at CX+1, so CX advances by 5.
	// BX = CX - 16(SP) (16(SP) holds the offset), DI = src_len - CX is the
	// number of bytes still available; R8 = DX+CX and BX = DX+BX are the two
	// pointers compared by matchLen.
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R10 accumulates the number of equal bytes found so far.
	XORL R10, R10

matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B:
	// With at least 16 bytes left, XOR two 8-byte words from each side.
	// A non-zero XOR result identifies the word holding the first mismatch.
	CMPL DI, $0x10
	JB   matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
	XORQ 8(BX)(R10*1), R11
	JNZ  matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B

matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B:
	// Mismatch in the second word: lowest set bit / 8 = equal bytes in that
	// word; add 8 for the first word.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm8B

matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B:
	// Fewer than 16 bytes left: try one more 8-byte word.
	CMPL DI, $0x08
	JB   matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
	MOVQ (R8)(R10*1), R9
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B

matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B:
	// Mismatch inside the word in R9: lowest set bit / 8 = equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm8B

matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B:
	// 4-byte step; on mismatch fall through to the 2-byte and 1-byte checks.
	CMPL DI, $0x04
	JB   matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
	MOVL (R8)(R10*1), R9
	CMPL (BX)(R10*1), R9
	JNE  matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B:
	// Exactly one byte left goes straight to the byte compare; zero left ends.
	CMPL DI, $0x01
	JE   matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
	JB   repeat_extend_forward_end_encodeSnappyBlockAsm8B
	MOVW (R8)(R10*1), R9
	CMPW (BX)(R10*1), R9
	JNE  matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   repeat_extend_forward_end_encodeSnappyBlockAsm8B

matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B:
	// Final single-byte compare.
	MOVB (R8)(R10*1), R9
	CMPB (BX)(R10*1), R9
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm8B
	LEAL 1(R10), R10

repeat_extend_forward_end_encodeSnappyBlockAsm8B:
	// CX moves past the R10 bytes matched by matchLen. BX = CX - SI is the
	// length to encode (SI still holds the match start after backward
	// extension) and SI is then reloaded with the offset from 16(SP).
	ADDL R10, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI

	// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
	// While the length exceeds 64, write a 3-byte element: tag 0xee
	// ((59<<2)|2, i.e. a length-60 copy) plus the low 16 bits of the offset,
	// then subtract 60 from the remaining length and try again.
	CMPL BX, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
	// DI = length<<2. Lengths >= 12 use the 3-byte form; there is no offset
	// range test in this variant. Otherwise a 2-byte element is written:
	//   tag   = ((length-4)<<2 | 1) | (offset>>8)<<5   (computed as DI-15 | ...)
	//   byte1 = low 8 bits of the offset
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm8B

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
	// 3-byte element: tag = (length<<2)-2 = ((length-1)<<2 | 2), then the
	// low 16 bits of the offset.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm8B:
	// Everything up to CX is now encoded; record it in 12(SP) and resume searching.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm8B

no_repeat_found_encodeSnappyBlockAsm8B:
	// Entry state from the search loop: BX and DI are the previous table
	// entries for the bytes at CX and CX+1, R9 is the table index for the
	// bytes at CX+2, and SI holds the 8 input bytes at CX.
	// First candidate: 4 bytes at DX+BX against the low 32 bits of SI.
	CMPL (DX)(BX*1), SI
	JEQ  candidate_match_encodeSnappyBlockAsm8B
	// Drop one input byte, fetch the third candidate from table slot R9 and
	// test the second candidate (DI) against the bytes at CX+1.
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ  candidate2_match_encodeSnappyBlockAsm8B
	// Store CX+2 in slot R9, drop another byte and test the third candidate.
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ  candidate3_match_encodeSnappyBlockAsm8B
	// No match: continue at the next search position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm8B

candidate3_match_encodeSnappyBlockAsm8B:
	// Third candidate matched at CX+2; BX already holds that candidate.
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm8B

candidate2_match_encodeSnappyBlockAsm8B:
	// Second candidate matched at CX+1: still record CX+2 in slot R9, then
	// move the candidate from DI into BX.
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeSnappyBlockAsm8B:
	// Extend the match backwards. SI = 12(SP) is the lower bound for CX;
	// a candidate at position 0 cannot be extended.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeSnappyBlockAsm8B

match_extend_back_loop_encodeSnappyBlockAsm8B:
	// Step both positions back by one while CX > SI, the preceding bytes
	// are equal and the candidate has not reached 0.
	CMPL CX, SI
	JBE  match_extend_back_end_encodeSnappyBlockAsm8B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm8B
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeSnappyBlockAsm8B
	JMP  match_extend_back_loop_encodeSnappyBlockAsm8B

match_extend_back_end_encodeSnappyBlockAsm8B:
	// Output space check: AX + 3 + (CX - 12(SP)) must stay below the limit
	// stored at (SP); otherwise the function returns 0.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm8B:
	// Emit the literal header for the bytes between 12(SP) and CX (skipped
	// entirely when there are none).
	// After setup: SI = source pointer (DX + old 12(SP)), R8 = literal length,
	// DI = length-1, and 12(SP) has been advanced to CX.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm8B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	// length-1 < 60: one-byte header; < 256: two-byte header; else three-byte.
	CMPL DI, $0x3c
	JB   one_byte_match_emit_encodeSnappyBlockAsm8B
	CMPL DI, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBlockAsm8B
	// The flags are unchanged since the CMPL above, so this second JB can
	// never be taken; execution falls through into the three-byte case.
	JB   three_bytes_match_emit_encodeSnappyBlockAsm8B

three_bytes_match_emit_encodeSnappyBlockAsm8B:
	// Tag 0xf4 (61<<2) followed by the low 16 bits of length-1.
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm8B

two_bytes_match_emit_encodeSnappyBlockAsm8B:
	// Tag 0xf0 (60<<2) followed by length-1 in one byte. Lengths up to 64
	// use the short copy routine, longer ones the long copy routine.
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	CMPL DI, $0x40
	JB   memmove_match_emit_encodeSnappyBlockAsm8B
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm8B

one_byte_match_emit_encodeSnappyBlockAsm8B:
	// Single-byte header: (length-1)<<2.
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBlockAsm8B:
	// Copy R8 literal bytes (at most 64 on this path) from SI to AX.
	// DI = AX + R8 is the output pointer after the copy.
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	// Dispatch on length: <=8, 9..16, 17..32, 33..64.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
	// Always moves a full 8 bytes, even for shorter lengths; only DI (set
	// above) determines how far the output pointer advances.
	// NOTE(review): this over-read/over-write presumably relies on the
	// src_len-8 search limit and the output margin checked earlier — confirm.
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
	// Two possibly overlapping 8-byte moves: first 8 and last 8 bytes.
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
	// Two possibly overlapping 16-byte moves: first 16 and last 16 bytes.
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
	// First 32 and last 32 bytes; all loads are done before any store.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
	// Advance the output pointer past the copied literals.
	MOVQ DI, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm8B

memmove_long_match_emit_encodeSnappyBlockAsm8B:
	// Copy R8 literal bytes from SI to AX for lengths above 64.
	// DI = AX + R8 is the output pointer after the copy.
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	// The first and last 32 source bytes are kept in X0..X3 and stored last,
	// so the middle can be written in 32-byte chunks with aligned stores.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	// R10 = number of 32-byte chunks, R11 = 64 - (AX & 31), so that
	// AX + R11 - 32 is a 32-byte aligned destination address.
	MOVQ  R8, R10
	SHRQ  $0x05, R10
	MOVQ  AX, R9
	ANDL  $0x0000001f, R9
	MOVQ  $0x00000040, R11
	SUBQ  R9, R11
	// DECQ does not write CF (still clear from the SUBQ), so JA is taken
	// whenever R10 is non-zero after the decrement.
	DECQ  R10
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(SI)(R11*1), R9
	LEAQ  -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
	// One 32-byte chunk via running pointers R9 (source) and R12 (aligned destination).
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ  $0x20, R12
	ADDQ  $0x20, R9
	ADDQ  $0x20, R11
	DECQ  R10
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	// Copy the chunk [R11-32, R11), then advance R11 by 32; repeat while
	// R8 >= R11.
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ  $0x20, R11
	CMPQ  R8, R11
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	// Write the saved head and tail with unaligned stores, then advance AX.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  DI, AX

emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
match_nolit_loop_encodeSnappyBlockAsm8B:
	// A match at CX against candidate BX: store the offset CX - BX in 16(SP),
	// skip 4 bytes on both sides and measure the rest of the match.
	// SI = src_len - CX is the byte budget; DI = DX+CX and BX = DX+BX are
	// the two pointers being compared.
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 accumulates the number of equal bytes found so far.
	XORL R9, R9

matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B:
	// With at least 16 bytes left, XOR two 8-byte words from each side.
	// A non-zero XOR result identifies the word holding the first mismatch.
	CMPL SI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBlockAsm8B
	MOVQ (DI)(R9*1), R8
	MOVQ 8(DI)(R9*1), R10
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
	XORQ 8(BX)(R9*1), R10
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B
	LEAL -16(SI), SI
	LEAL 16(R9), R9
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B

matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B:
	// Mismatch in the second word: lowest set bit / 8 = equal bytes in that
	// word; add 8 for the first word.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL 8(R9)(R10*1), R9
	JMP  match_nolit_end_encodeSnappyBlockAsm8B

matchlen_match8_match_nolit_encodeSnappyBlockAsm8B:
	// Fewer than 16 bytes left: try one more 8-byte word.
	CMPL SI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	JMP  matchlen_match4_match_nolit_encodeSnappyBlockAsm8B

matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B:
	// Mismatch inside the word in R8: lowest set bit / 8 = equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP  match_nolit_end_encodeSnappyBlockAsm8B

matchlen_match4_match_nolit_encodeSnappyBlockAsm8B:
	// 4-byte step; on mismatch fall through to the 2-byte and 1-byte checks.
	CMPL SI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE  matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
	LEAL -4(SI), SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeSnappyBlockAsm8B:
	// Exactly one byte left goes straight to the byte compare; zero left ends.
	CMPL SI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
	JB   match_nolit_end_encodeSnappyBlockAsm8B
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE  matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
	LEAL 2(R9), R9
	SUBL $0x02, SI
	JZ   match_nolit_end_encodeSnappyBlockAsm8B

matchlen_match1_match_nolit_encodeSnappyBlockAsm8B:
	// Final single-byte compare.
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE  match_nolit_end_encodeSnappyBlockAsm8B
	LEAL 1(R9), R9

match_nolit_end_encodeSnappyBlockAsm8B:
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
	CMPL R9, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm8B

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL  BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm8B

emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeSnappyBlockAsm8B
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm8B:
	MOVQ  $0x9e3779b1, R8
	MOVQ  SI, DI
	SHRQ  $0x10, SI
	MOVQ  SI, BX
	SHLQ  $0x20, DI
	IMULQ R8, DI
	SHRQ  $0x38, DI
	SHLQ  $0x20, BX
	IMULQ R8, BX
	SHRQ  $0x38, BX
	LEAL  -2(CX), R8
	LEAQ  24(SP)(BX*4), R9
	MOVL  (R9), BX
	MOVL  R8, 24(SP)(DI*4)
	MOVL  CX, (R9)
	CMPL  (DX)(BX*1), SI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm8B
	INCL  CX
	JMP   search_loop_encodeSnappyBlockAsm8B

emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm8B:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBlockAsm8B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBlockAsm8B
	JB   three_bytes_emit_remainder_encodeSnappyBlockAsm8B

three_bytes_emit_remainder_encodeSnappyBlockAsm8B:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm8B

two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm8B

one_byte_emit_remainder_encodeSnappyBlockAsm8B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B

memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst using two hash tables kept on the stack and returns
// the number of bytes written (AX - dst_base). Returns 0 from any of the
// destination-limit checks below when the output pointer would reach the
// limit stored at (SP).
//
// Stack frame layout ($589848 bytes of locals):
//   0(SP)       8 bytes  dst limit = dst_base + (src_len - 9 - src_len>>5)
//   8(SP)       4 bytes  search limit = src_len - 8
//   12(SP)      4 bytes  nextEmit: src index of the first literal byte not yet written
//   16(SP)      4 bytes  offset of the last emitted match (written here, never read in this function)
//   20(SP)      4 bytes  next search position to try if the current one yields no match
//   24(SP)      long table:  1<<17 entries of 4 bytes (index = 17-bit hash of 7 bytes)
//   524312(SP)  short table: 1<<14 entries of 4 bytes (index = 14-bit hash of 4 bytes)
//
// Register roles that hold across the main loop:
//   AX = current dst write pointer
//   DX = src base pointer (reused as scratch only in the final remainder emit)
//   CX = current src position s
TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56
	MOVQ dst_base+0(FP), AX
	// 0x1200 iterations * 128 bytes = 589824 bytes: clears both hash tables.
	MOVQ $0x00001200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm
	// nextEmit = 0
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	// search limit = src_len - 8
	MOVL  BX, 8(SP)
	// dst limit = dst_base + (src_len - 9 - (src_len >> 5)); the subtraction is 32-bit.
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// s = 1, last offset = 0, DX = src base from here on.
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm:
	// Step size grows with the distance since the last emit:
	// BX = (s - nextEmit) >> 7; if BX > 99 the next position is s + 100,
	// otherwise it is s + 1 + BX.
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x07, BX
	CMPL BX, $0x63
	JBE  check_maxskip_ok_encodeSnappyBetterBlockAsm
	LEAL 100(CX), BX
	JMP  check_maxskip_cont_encodeSnappyBetterBlockAsm

check_maxskip_ok_encodeSnappyBetterBlockAsm:
	LEAL 1(CX)(BX*1), BX

check_maxskip_cont_encodeSnappyBetterBlockAsm:
	// Stop searching once the next position reaches the search limit.
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeSnappyBetterBlockAsm
	// SI = 8 bytes at src[s]; remember the next position in 20(SP).
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	// R8 = multiplier for the long hash, BX = multiplier for the short hash.
	MOVQ  $0x00cf1bbcdcbfa563, R8
	MOVQ  $0x9e3779b1, BX
	MOVQ  SI, R9
	MOVQ  SI, R10
	// Long hash: drop the top byte (7 bytes remain), multiply, keep the top 17 bits.
	SHLQ  $0x08, R9
	IMULQ R8, R9
	SHRQ  $0x2f, R9
	// Short hash: keep only the low 4 bytes, multiply, keep the top 14 bits.
	SHLQ  $0x20, R10
	IMULQ BX, R10
	SHRQ  $0x32, R10
	// BX = long-table candidate, DI = short-table candidate; both slots are
	// then overwritten with the current position s.
	MOVL  24(SP)(R9*4), BX
	MOVL  524312(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	MOVL  CX, 524312(SP)(R10*4)
	// Load 8 bytes at each candidate and compare against the 8 bytes at s.
	MOVQ  (DX)(BX*1), R9
	MOVQ  (DX)(DI*1), R10
	CMPQ  R9, SI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm
	CMPQ  R10, SI
	JNE   no_short_found_encodeSnappyBetterBlockAsm
	// Short candidate matches all 8 bytes: use it.
	MOVL  DI, BX
	JMP   candidate_match_encodeSnappyBetterBlockAsm

no_short_found_encodeSnappyBetterBlockAsm:
	// No 8-byte match; fall back to 32-bit compares of the low 4 bytes,
	// long candidate first, then the short candidate.
	CMPL R9, SI
	JEQ  candidate_match_encodeSnappyBetterBlockAsm
	CMPL R10, SI
	JEQ  candidateS_match_encodeSnappyBetterBlockAsm
	// Nothing matched: advance s to the saved next position and retry.
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBetterBlockAsm

candidateS_match_encodeSnappyBetterBlockAsm:
	// Short candidate matched 4 bytes. Before accepting it, probe the long
	// table at s+1: SI >> 8 is the data starting at s+1, and R8 still holds
	// the long-hash multiplier.
	SHRQ  $0x08, SI
	MOVQ  SI, R9
	SHLQ  $0x08, R9
	IMULQ R8, R9
	SHRQ  $0x2f, R9
	MOVL  24(SP)(R9*4), BX
	INCL  CX
	MOVL  CX, 24(SP)(R9*4)
	// If the long candidate for s+1 matches 4 bytes, continue with s+1 and BX.
	CMPL  (DX)(BX*1), SI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm
	// Otherwise restore s and use the short candidate saved in DI.
	DECL  CX
	MOVL  DI, BX

candidate_match_encodeSnappyBetterBlockAsm:
	// SI = nextEmit. Skip the backwards extension if the candidate is at index 0.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm

match_extend_back_loop_encodeSnappyBetterBlockAsm:
	// Extend the match backwards while s > nextEmit, the preceding bytes are
	// equal, and the candidate index has not reached 0.
	CMPL CX, SI
	JBE  match_extend_back_end_encodeSnappyBetterBlockAsm
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm

match_extend_back_end_encodeSnappyBetterBlockAsm:
	// Destination check: AX + 5 + (s - nextEmit) must stay below the dst limit,
	// otherwise return 0.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 5(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeSnappyBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm:
	// SI = match start. Skip 4 bytes on both sides, then measure how far the
	// match continues: DI = bytes left in src after s+4, R8 = &src[s+4],
	// R9 = &src[candidate+4].
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	// R11 accumulates the number of equal bytes; DI counts bytes remaining.
	XORL R11, R11

matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm:
	// Compare 16 bytes per iteration as two 8-byte XORs; a non-zero XOR result
	// locates the first differing byte.
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm
	MOVQ (R8)(R11*1), R10
	MOVQ 8(R8)(R11*1), R12
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm
	XORQ 8(R9)(R11*1), R12
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm
	LEAL -16(DI), DI
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm

matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm:
	// Mismatch in the second 8-byte word: index of lowest set bit / 8 gives the
	// number of equal bytes in that word; add 8 for the first word.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm

matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm:
	// Fewer than 16 bytes left: try a single 8-byte compare.
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP  matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm

matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm:
	// Mismatch in the first 8-byte word: lowest set bit index / 8 = equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm:
	// 4-byte compare; on mismatch the 2-byte and 1-byte steps refine the count.
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE  matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm:
	// Exactly 1 byte left goes to the 1-byte step; 0 bytes left ends the count.
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
	JB   match_nolit_end_encodeSnappyBetterBlockAsm
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE  matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeSnappyBetterBlockAsm

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm:
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm
	LEAL 1(R11), R11

match_nolit_end_encodeSnappyBetterBlockAsm:
	// DI = match offset (s - candidate; both were advanced by 4 above).
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	// The match is accepted when more than 1 byte matched beyond the first
	// four (R11 > 1) or when the offset is at most 0xffff. Otherwise it is
	// dropped and the search resumes at the saved next position + 1.
	CMPL R11, $0x01
	JA   match_length_ok_encodeSnappyBetterBlockAsm
	CMPL DI, $0x0000ffff
	JBE  match_length_ok_encodeSnappyBetterBlockAsm
	MOVL 20(SP), CX
	INCL CX
	JMP  search_loop_encodeSnappyBetterBlockAsm

match_length_ok_encodeSnappyBetterBlockAsm:
	// Record the offset, then emit the pending literals src[nextEmit:SI]
	// (SI = match start). Nothing to emit when nextEmit == SI.
	MOVL DI, 16(SP)
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
	// R9 = literal source pointer, R8 = literal length, BX = length - 1.
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	// Literal header size depends on length-1:
	//   < 60       1 byte:  (length-1) << 2
	//   < 1<<8     0xf0 + 1 length byte
	//   < 1<<16    0xf4 + 2 length bytes
	//   < 1<<24    0xf8 + 3 length bytes
	//   otherwise  0xfc + 4 length bytes
	CMPL BX, $0x3c
	JB   one_byte_match_emit_encodeSnappyBetterBlockAsm
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBetterBlockAsm
	CMPL BX, $0x00010000
	JB   three_bytes_match_emit_encodeSnappyBetterBlockAsm
	CMPL BX, $0x01000000
	JB   four_bytes_match_emit_encodeSnappyBetterBlockAsm
	MOVB $0xfc, (AX)
	MOVL BX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm

four_bytes_match_emit_encodeSnappyBetterBlockAsm:
	// 0xf8, low 16 bits of length-1, then bits 16..23.
	MOVL BX, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW BX, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm

three_bytes_match_emit_encodeSnappyBetterBlockAsm:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm

two_bytes_match_emit_encodeSnappyBetterBlockAsm:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	// length-1 < 64 still fits the short copy; larger literals use the long copy.
	CMPL BX, $0x40
	JB   memmove_match_emit_encodeSnappyBetterBlockAsm
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm

one_byte_match_emit_encodeSnappyBetterBlockAsm:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm:
	// BX = dst pointer after the literal bytes.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	// Copies R8 (<= 64) bytes from R9 to AX, dispatching on size class.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
	// Fixed 8-byte copy even when the literal is shorter; AX is advanced only
	// by the real length afterwards (via BX).
	// NOTE(review): presumably safe because of the src/dst limit margins set up
	// at function entry - confirm against the generator.
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
	// Two 8-byte moves: first 8 bytes and last 8 bytes (they may overlap).
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
	// Two 16-byte moves: first 16 bytes and last 16 bytes (they may overlap).
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
	// Four 16-byte moves: first 32 bytes and last 32 bytes (they may overlap).
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm

memmove_long_match_emit_encodeSnappyBetterBlockAsm:
	// BX = dst pointer after the literal bytes.
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// Save the first and last 32 source bytes in X0..X3; they are stored last
	// and cover the unaligned head and tail of the destination.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R12 = length / 32. R13 = 64 - (AX & 31), so that AX + R13 - 32 is
	// 32-byte aligned for the MOVOA stores below.
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	// NOTE(review): DECQ does not write CF, so JA tests ZF from DECQ together
	// with the CF left by the SUBQ above.
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
	// Pointer-based 32-byte copy: R10 = source, R14 = aligned destination,
	// R13 tracks the offset for the indexed loop that follows.
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
	// Indexed 32-byte copy at offset R13-32; repeats while length >= R13.
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	// Store the saved head and tail blocks with unaligned moves.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
	// CX = end of the match (s+4 plus the extension), R11 = total match length
	// (extension + 4), nextEmit = end of the match.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
	// Offsets below 1<<16 use the 2-byte-offset encodings; larger offsets use
	// the 5-byte form with a 4-byte offset.
	CMPL DI, $0x00010000
	JB   two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm

four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
	// While more than 64 bytes remain: emit tag 0xff + 4-byte offset and
	// subtract 64 from the remaining length.
	CMPL R11, $0x40
	JBE  four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
	MOVB $0xff, (AX)
	MOVL DI, 1(AX)
	LEAL -64(R11), R11
	ADDQ $0x05, AX
	CMPL R11, $0x04
	JB   four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
	JMP  four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm

four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
	// Remaining length 0 emits nothing; otherwise tag = 4*length - 1
	// followed by the 4-byte offset.
	TESTL R11, R11
	JZ    match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
	XORL  BX, BX
	LEAL  -1(BX)(R11*4), R11
	MOVB  R11, (AX)
	MOVL  DI, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm

two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
	// While more than 64 bytes remain: emit tag 0xee + 2-byte offset and
	// subtract 60 from the remaining length.
	CMPL R11, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
	// BX = 4*length. The 2-byte form is used only when length < 12 and
	// offset < 2048; everything else takes the 3-byte form.
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
	CMPL DI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
	// 2-byte form: tag = (4*length - 15) | ((offset >> 8) << 5), followed by
	// the low 8 bits of the offset.
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
	// 3-byte form: tag = 4*length - 2, followed by the 16-bit offset.
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
	// Finish if s reached the search limit; return 0 if dst reached its limit.
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeSnappyBetterBlockAsm
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
	// Index positions inside the match just emitted.
	// BX = long-hash multiplier, DI = short-hash multiplier,
	// SI = match start + 1, R8 = s - 2 (s = end of match).
	MOVQ  $0x00cf1bbcdcbfa563, BX
	MOVQ  $0x9e3779b1, DI
	LEAQ  1(SI), SI
	LEAQ  -2(CX), R8
	MOVQ  (DX)(SI*1), R9
	MOVQ  1(DX)(SI*1), R10
	MOVQ  (DX)(R8*1), R11
	MOVQ  1(DX)(R8*1), R12
	// R9 / R11 = long hashes at SI and R8; R10 / R12 = short hashes at SI+1 and R8+1.
	SHLQ  $0x08, R9
	IMULQ BX, R9
	SHRQ  $0x2f, R9
	SHLQ  $0x20, R10
	IMULQ DI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x08, R11
	IMULQ BX, R11
	SHRQ  $0x2f, R11
	SHLQ  $0x20, R12
	IMULQ DI, R12
	SHRQ  $0x32, R12
	// DI is reused from here on (the short multiplier is no longer needed).
	LEAQ  1(SI), DI
	LEAQ  1(R8), R13
	MOVL  SI, 24(SP)(R9*4)
	MOVL  R8, 24(SP)(R11*4)
	MOVL  DI, 524312(SP)(R10*4)
	MOVL  R13, 524312(SP)(R12*4)
	// DI = midpoint (SI + R8 + 1) / 2; then SI moves up one and R8 down one.
	LEAQ  1(R8)(SI*1), DI
	SHRQ  $0x01, DI
	ADDQ  $0x01, SI
	SUBQ  $0x01, R8

index_loop_encodeSnappyBetterBlockAsm:
	// While DI < R8: store long-table entries for positions SI and DI,
	// advancing both by 2 per iteration. Then resume searching at s (CX).
	CMPQ  DI, R8
	JAE   search_loop_encodeSnappyBetterBlockAsm
	MOVQ  (DX)(SI*1), R9
	MOVQ  (DX)(DI*1), R10
	SHLQ  $0x08, R9
	IMULQ BX, R9
	SHRQ  $0x2f, R9
	SHLQ  $0x08, R10
	IMULQ BX, R10
	SHRQ  $0x2f, R10
	MOVL  SI, 24(SP)(R9*4)
	MOVL  DI, 24(SP)(R10*4)
	ADDQ  $0x02, SI
	ADDQ  $0x02, DI
	JMP   index_loop_encodeSnappyBetterBlockAsm

emit_remainder_encodeSnappyBetterBlockAsm:
	// Destination check for the trailing literals:
	// AX + 5 + (src_len - nextEmit) must stay below the dst limit, else return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeSnappyBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm:
	// Emit src[nextEmit:src_len] as one literal; nothing to do if nextEmit == src_len.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
	// CX = literal source pointer, SI = literal length, DX = length - 1
	// (DX stops being the src base from here on).
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Same literal header size classes as in the match path (thresholds on length-1).
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBetterBlockAsm
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
	CMPL DX, $0x00010000
	JB   three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
	CMPL DX, $0x01000000
	JB   four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
	MOVB $0xfc, (AX)
	MOVL DX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

four_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
	// 0xf8, low 16 bits of length-1, then bits 16..23.
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	// length-1 < 64 still fits the short copy; larger literals use the long copy.
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBetterBlockAsm
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

one_byte_emit_remainder_encodeSnappyBetterBlockAsm:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Exact-size variant: unlike the match path, lengths below 8 get their own
	// cases so no bytes beyond the literal are read or written.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2:
	// First byte and last byte (the same byte when length is 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3:
	// One 2-byte move plus one 1-byte move.
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7:
	// First 4 bytes and last 4 bytes (they may overlap).
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
	// First 8 bytes and last 8 bytes (they may overlap).
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32:
	// First 16 bytes and last 16 bytes (they may overlap).
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64:
	// First 32 bytes and last 32 bytes (they may overlap).
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Save the first and last 32 source bytes in X0..X3; they are stored last
	// and cover the unaligned head and tail of the destination.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = length / 32. R8 = 64 - (AX & 31), so that AX + R8 - 32 is
	// 32-byte aligned for the MOVOA stores below.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// NOTE(review): DECQ does not write CF, so JA tests ZF from DECQ together
	// with the CF left by the SUBQ above.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back:
	// Pointer-based 32-byte copy: SI = source, R9 = aligned destination,
	// R8 tracks the offset for the indexed loop that follows.
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
	// Indexed 32-byte copy at offset R8-32; repeats while length >= R8.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	// Store the saved head and tail blocks with unaligned moves.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000a00, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm64K:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm64K
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm64K:
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x07, BX
	LEAL  1(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeSnappyBetterBlockAsm64K
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	MOVQ  $0x00cf1bbcdcbfa563, R8
	MOVQ  $0x9e3779b1, BX
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHLQ  $0x08, R9
	IMULQ R8, R9
	SHRQ  $0x30, R9
	SHLQ  $0x20, R10
	IMULQ BX, R10
	SHRQ  $0x32, R10
	MOVL  24(SP)(R9*4), BX
	MOVL  262168(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	MOVL  CX, 262168(SP)(R10*4)
	MOVQ  (DX)(BX*1), R9
	MOVQ  (DX)(DI*1), R10
	CMPQ  R9, SI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm64K
	CMPQ  R10, SI
	JNE   no_short_found_encodeSnappyBetterBlockAsm64K
	MOVL  DI, BX
	JMP   candidate_match_encodeSnappyBetterBlockAsm64K

no_short_found_encodeSnappyBetterBlockAsm64K:
	CMPL R9, SI
	JEQ  candidate_match_encodeSnappyBetterBlockAsm64K
	CMPL R10, SI
	JEQ  candidateS_match_encodeSnappyBetterBlockAsm64K
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBetterBlockAsm64K

candidateS_match_encodeSnappyBetterBlockAsm64K:
	SHRQ  $0x08, SI
	MOVQ  SI, R9
	SHLQ  $0x08, R9
	IMULQ R8, R9
	SHRQ  $0x30, R9
	MOVL  24(SP)(R9*4), BX
	INCL  CX
	MOVL  CX, 24(SP)(R9*4)
	CMPL  (DX)(BX*1), SI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm64K
	DECL  CX
	MOVL  DI, BX

candidate_match_encodeSnappyBetterBlockAsm64K:
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm64K

match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
	CMPL CX, SI
	JBE  match_extend_back_end_encodeSnappyBetterBlockAsm64K
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm64K
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm64K
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm64K

match_extend_back_end_encodeSnappyBetterBlockAsm64K:
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm64K:
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	XORL R11, R11

matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVQ (R8)(R11*1), R10
	MOVQ 8(R8)(R11*1), R12
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
	XORQ 8(R9)(R11*1), R12
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL -16(DI), DI
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K

matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP  matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K

matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE  matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
	JB   match_nolit_end_encodeSnappyBetterBlockAsm64K
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE  matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K:
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm64K
	LEAL 1(R11), R11

match_nolit_end_encodeSnappyBetterBlockAsm64K:
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	MOVL DI, 16(SP)
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB   one_byte_match_emit_encodeSnappyBetterBlockAsm64K
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
	JB   three_bytes_match_emit_encodeSnappyBetterBlockAsm64K

three_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm64K

two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_match_emit_encodeSnappyBetterBlockAsm64K
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm64K

one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm64K:
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K

memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL R11, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
	CMPL DI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeSnappyBetterBlockAsm64K
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
	MOVQ  $0x00cf1bbcdcbfa563, BX
	MOVQ  $0x9e3779b1, DI
	LEAQ  1(SI), SI
	LEAQ  -2(CX), R8
	MOVQ  (DX)(SI*1), R9
	MOVQ  1(DX)(SI*1), R10
	MOVQ  (DX)(R8*1), R11
	MOVQ  1(DX)(R8*1), R12
	SHLQ  $0x08, R9
	IMULQ BX, R9
	SHRQ  $0x30, R9
	SHLQ  $0x20, R10
	IMULQ DI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x08, R11
	IMULQ BX, R11
	SHRQ  $0x30, R11
	SHLQ  $0x20, R12
	IMULQ DI, R12
	SHRQ  $0x32, R12
	LEAQ  1(SI), DI
	LEAQ  1(R8), R13
	MOVL  SI, 24(SP)(R9*4)
	MOVL  R8, 24(SP)(R11*4)
	MOVL  DI, 262168(SP)(R10*4)
	MOVL  R13, 262168(SP)(R12*4)
	LEAQ  1(R8)(SI*1), DI
	SHRQ  $0x01, DI
	ADDQ  $0x01, SI
	SUBQ  $0x01, R8

index_loop_encodeSnappyBetterBlockAsm64K:
	CMPQ  DI, R8
	JAE   search_loop_encodeSnappyBetterBlockAsm64K
	MOVQ  (DX)(SI*1), R9
	MOVQ  (DX)(DI*1), R10
	SHLQ  $0x08, R9
	IMULQ BX, R9
	SHRQ  $0x30, R9
	SHLQ  $0x08, R10
	IMULQ BX, R10
	SHRQ  $0x30, R10
	MOVL  SI, 24(SP)(R9*4)
	MOVL  DI, 24(SP)(R10*4)
	ADDQ  $0x02, SI
	ADDQ  $0x02, DI
	JMP   index_loop_encodeSnappyBetterBlockAsm64K

emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
	JB   three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K

one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Compresses src into dst and returns the number of bytes written to dst, or
// 0 when the output would reach the limit pointer computed in the prologue
// (dst + len(src) - 9 - len(src)>>5).
//
// Frame layout (81944 bytes of locals):
//   0(SP)      dst limit pointer; the write cursor must stay below it
//   8(SP)      len(src) - 8; searching stops once s or nextS reaches it
//   12(SP)     nextEmit: src index of the first byte not yet written out
//   16(SP)     offset of the most recent match (stored only, never read here)
//   20(SP)     nextS: src index the search loop continues from on a miss
//   24(SP)     long table, 16384 uint32 entries, 14-bit hash of 6 bytes
//   65560(SP)  short table, 4096 uint32 entries, 12-bit hash of 4 bytes
//
// Register roles in the main loop: AX = dst write cursor, DX = src base,
// CX = current src index s.
TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56
	MOVQ dst_base+0(FP), AX
	// Zero both hash tables: 0x280 iterations of 128 bytes = 81920 bytes
	// starting at 24(SP).
	MOVQ $0x00000280, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm12B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm12B
	// nextEmit = 0.
	MOVL  $0x00000000, 12(SP)
	// 8(SP) = len(src) - 8; (SP) = dst + (len(src) - 9 - len(src)>>5).
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// s = 1, last offset = 0, DX = src base for the rest of the main loop.
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm12B:
	// nextS = s + 1 + (s - nextEmit)>>6, so the step grows the longer no match
	// is found. Go to the tail once nextS reaches len(src) - 8.
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x06, BX
	LEAL  1(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeSnappyBetterBlockAsm12B
	// SI = 8 bytes at src[s].
	// R9  = long hash: low 6 bytes (SHLQ 16) * 0xcf1bbcdcbf9b, top 14 bits.
	// R10 = short hash: low 4 bytes (SHLQ 32) * 0x9e3779b1, top 12 bits.
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  $0x9e3779b1, BX
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x32, R9
	SHLQ  $0x20, R10
	IMULQ BX, R10
	SHRQ  $0x34, R10
	// Fetch both candidates (BX = long, DI = short), then record s in both
	// tables.
	MOVL  24(SP)(R9*4), BX
	MOVL  65560(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	MOVL  CX, 65560(SP)(R10*4)
	// Prefer a full 8-byte match, trying the long candidate first.
	MOVQ  (DX)(BX*1), R9
	MOVQ  (DX)(DI*1), R10
	CMPQ  R9, SI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm12B
	CMPQ  R10, SI
	JNE   no_short_found_encodeSnappyBetterBlockAsm12B
	MOVL  DI, BX
	JMP   candidate_match_encodeSnappyBetterBlockAsm12B

no_short_found_encodeSnappyBetterBlockAsm12B:
	// No 8-byte match: compare only the low 4 bytes, long candidate first.
	CMPL R9, SI
	JEQ  candidate_match_encodeSnappyBetterBlockAsm12B
	CMPL R10, SI
	JEQ  candidateS_match_encodeSnappyBetterBlockAsm12B
	// Nothing matched; continue searching from nextS.
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBetterBlockAsm12B

candidateS_match_encodeSnappyBetterBlockAsm12B:
	// The short candidate matches 4 bytes. Before settling for it, probe the
	// long table for position s+1 (SI>>8 holds the bytes starting at src[s+1]).
	// s+1 is written into that long-table slot whether or not the probe hits.
	SHRQ  $0x08, SI
	MOVQ  SI, R9
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x32, R9
	MOVL  24(SP)(R9*4), BX
	INCL  CX
	MOVL  CX, 24(SP)(R9*4)
	CMPL  (DX)(BX*1), SI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm12B
	// Probe missed: step CX back to s and use the short candidate.
	DECL  CX
	MOVL  DI, BX

candidate_match_encodeSnappyBetterBlockAsm12B:
	// SI = nextEmit. A candidate at index 0 cannot be extended backwards.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm12B

match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
	// Move s and the candidate back one byte at a time while the preceding
	// bytes are equal, stopping at nextEmit (SI) or at candidate index 0.
	CMPL CX, SI
	JBE  match_extend_back_end_encodeSnappyBetterBlockAsm12B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm12B
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm12B
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm12B

match_extend_back_end_encodeSnappyBetterBlockAsm12B:
	// Return 0 if cursor + 3 + (s - nextEmit) would reach the dst limit.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeSnappyBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm12B:
	// SI = match start. Step both positions forward by 4, then count further
	// equal bytes in R11. DI = bytes left in src after the new s;
	// R8 = &src[s], R9 = &src[candidate].
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	// R11 = number of equal bytes found so far, DI = bytes still available.
	XORL R11, R11

matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B:
	// Compare 16 bytes per iteration as two qwords; a non-zero XOR marks the
	// first difference.
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B
	MOVQ (R8)(R11*1), R10
	MOVQ 8(R8)(R11*1), R12
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B
	XORQ 8(R9)(R11*1), R12
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B
	LEAL -16(DI), DI
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B

matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B:
	// Difference is in the second qword: index of the lowest set bit, divided
	// by 8, is the count of equal bytes there; add 8 for the first qword.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm12B

matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B:
	// Fewer than 16 bytes left: try one 8-byte comparison.
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP  matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B

matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B:
	// Difference is in the qword at R11: add (lowest set bit index)/8.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm12B

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B:
	// Try 4 bytes; on a mismatch the 2- and 1-byte steps below narrow it down.
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE  matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B:
	// DI == 1: only a single byte remains; DI == 0: done; otherwise try 2 bytes.
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
	JB   match_nolit_end_encodeSnappyBetterBlockAsm12B
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE  matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeSnappyBetterBlockAsm12B

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B:
	// Final single-byte comparison.
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm12B
	LEAL 1(R11), R11

match_nolit_end_encodeSnappyBetterBlockAsm12B:
	// DI = match offset (s - candidate).
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	// Only the offset is recorded at 16(SP); this function never compares it
	// with the previous one. If nextEmit already equals the match start (SI)
	// there are no pending literals to write.
	MOVL DI, 16(SP)
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
	// Emit src[nextEmit:SI] as a literal: R8 = length, R9 = source pointer,
	// BX = length-1 for the tag, and nextEmit moves up to the match start.
	// length-1 < 60: one tag byte; < 256: 0xf0 plus one length byte;
	// otherwise 0xf4 plus two length bytes.
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB   one_byte_match_emit_encodeSnappyBetterBlockAsm12B
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBetterBlockAsm12B
	// NOTE(review): the flags are unchanged since the JB above, so this branch
	// can never be taken; control falls through to the same label regardless.
	JB   three_bytes_match_emit_encodeSnappyBetterBlockAsm12B

three_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
	// Tag 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm12B

two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
	// Tag 0xf0 followed by length-1 as one byte. Lengths up to 64 use the
	// short copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_match_emit_encodeSnappyBetterBlockAsm12B
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm12B

one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm12B:
	// BX = dst cursor after the literal.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	// Copy R8 (at most 64) bytes from (R9) to (AX) with size-bucketed moves.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
	// Lengths <= 8 always move a full qword; only R8 bytes of it count because
	// AX is set to BX afterwards.
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
	// First and last qword; the two overlap when the length is below 16.
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
	// First and last 16 bytes; the two overlap when the length is below 32.
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
	// First 32 and last 32 bytes; they overlap when the length is below 64.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
	// Advance the dst cursor past the literal.
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B

memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
	// BX = dst cursor after the literal.
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// X0-X3 hold the first and last 32 source bytes; they are stored last, so
	// the chunked loop in between only has to cover the middle.
	// R12 = length/32. R13 = 64 - (AX & 31), which makes AX + R13 - 32 a
	// multiple of 32, the address the MOVOA stores below start from.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	// DECQ does not write CF, so JA sees CF from the SUBQ above and ZF from
	// the DECQ.
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
	// 32 bytes per iteration through the R10 (source) and R14 (dest) pointers.
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
	// 32 bytes per iteration indexed by R13, repeating while R8 >= R13; then
	// the saved head and tail blocks are written over both ends.
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
	// s += bytes counted by matchLen; R11 += 4 for the bytes stepped over
	// before counting; nextEmit = s.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
	// DI = offset, R11 = match length.
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
	// While the length exceeds 64, write tag 0xee plus the 16-bit offset and
	// reduce the length by 60. 0xee equals (60<<2)-2, the value the three-byte
	// form below produces for a length of 60.
	CMPL R11, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
	// BX = length<<2. Lengths >= 12 or offsets >= 2048 take the three-byte
	// form. Otherwise write two bytes:
	// tag = ((length<<2) - 15) | ((offset>>8) << 5), then the low offset byte.
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
	CMPL DI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
	// Three bytes: tag = (length<<2) - 2, then the 16-bit offset.
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
	// Past the search limit: flush the tail. Otherwise return 0 if the dst
	// cursor has reached the limit.
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeSnappyBetterBlockAsm12B
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
	// Index positions around the match just emitted. With SI = match start + 1
	// and R8 = s - 2: SI and R8 go into the long table, SI+1 and R8+1 into the
	// short table.
	MOVQ  $0x0000cf1bbcdcbf9b, BX
	MOVQ  $0x9e3779b1, DI
	LEAQ  1(SI), SI
	LEAQ  -2(CX), R8
	MOVQ  (DX)(SI*1), R9
	MOVQ  1(DX)(SI*1), R10
	MOVQ  (DX)(R8*1), R11
	MOVQ  1(DX)(R8*1), R12
	SHLQ  $0x10, R9
	IMULQ BX, R9
	SHRQ  $0x32, R9
	SHLQ  $0x20, R10
	IMULQ DI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x10, R11
	IMULQ BX, R11
	SHRQ  $0x32, R11
	SHLQ  $0x20, R12
	IMULQ DI, R12
	SHRQ  $0x34, R12
	LEAQ  1(SI), DI
	LEAQ  1(R8), R13
	MOVL  SI, 24(SP)(R9*4)
	MOVL  R8, 24(SP)(R11*4)
	MOVL  DI, 65560(SP)(R10*4)
	MOVL  R13, 65560(SP)(R12*4)
	// DI = (SI + R8 + 1) >> 1 becomes the second cursor for the loop below;
	// then SI += 1 and R8 -= 1.
	LEAQ  1(R8)(SI*1), DI
	SHRQ  $0x01, DI
	ADDQ  $0x01, SI
	SUBQ  $0x01, R8

index_loop_encodeSnappyBetterBlockAsm12B:
	// Until DI reaches R8, add positions SI and DI to the long table and
	// advance both by 2; then resume searching.
	CMPQ  DI, R8
	JAE   search_loop_encodeSnappyBetterBlockAsm12B
	MOVQ  (DX)(SI*1), R9
	MOVQ  (DX)(DI*1), R10
	SHLQ  $0x10, R9
	IMULQ BX, R9
	SHRQ  $0x32, R9
	SHLQ  $0x10, R10
	IMULQ BX, R10
	SHRQ  $0x32, R10
	MOVL  SI, 24(SP)(R9*4)
	MOVL  DI, 24(SP)(R10*4)
	ADDQ  $0x02, SI
	ADDQ  $0x02, DI
	JMP   index_loop_encodeSnappyBetterBlockAsm12B

emit_remainder_encodeSnappyBetterBlockAsm12B:
	// Return 0 if cursor + 3 + (len(src) - nextEmit) would reach the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeSnappyBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm12B:
	// Emit src[nextEmit:] as one final literal, unless it is empty.
	// From here CX = source pointer and SI = length; DX is overwritten with
	// length-1 for the tag, so it no longer holds the src base.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B
	// NOTE(review): the flags are unchanged since the JB above, so this branch
	// can never be taken; control falls through to the same label regardless.
	JB   three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
	// Tag 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
	// Tag 0xf0 followed by length-1 as one byte. Lengths up to 64 use the
	// short copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBetterBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B

one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
	// DX = dst cursor after the literal, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the match-emit copy above, the buckets here go down to 1 byte and
	// no store extends past the BX bytes being copied.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2:
	// First and last byte; these are the same byte when the length is 1.
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3:
	// One word plus the third byte.
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7:
	// First and last dword, overlapping.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
	// First and last qword; the two overlap when the length is below 16.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
	// First and last 16 bytes; the two overlap when the length is below 32.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
	// First 32 and last 32 bytes; they overlap when the length is below 64.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B:
	// Advance the dst cursor past the literal.
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B:
	// DX = dst cursor after the literal, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as the long copy in the match-emit path, with CX as source
	// and BX as length: X0-X3 keep the first and last 32 bytes, DI = length/32,
	// R8 = 64 - (AX & 31) is the running index for the aligned stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// DECQ does not write CF, so JA sees CF from the SUBQ above and ZF from
	// the DECQ.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
	// 32 bytes per iteration through the SI (source) and R9 (dest) pointers.
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
	// 32 bytes per iteration indexed by R8, repeating while BX >= R8; then the
	// saved head and tail blocks are written over both ends.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
	// Return the number of bytes written: dst cursor - dst base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x000000a0, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm10B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm10B
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm10B:
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x05, BX
	LEAL  1(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeSnappyBetterBlockAsm10B
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  $0x9e3779b1, BX
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x34, R9
	SHLQ  $0x20, R10
	IMULQ BX, R10
	SHRQ  $0x36, R10
	MOVL  24(SP)(R9*4), BX
	MOVL  16408(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	MOVL  CX, 16408(SP)(R10*4)
	MOVQ  (DX)(BX*1), R9
	MOVQ  (DX)(DI*1), R10
	CMPQ  R9, SI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm10B
	CMPQ  R10, SI
	JNE   no_short_found_encodeSnappyBetterBlockAsm10B
	MOVL  DI, BX
	JMP   candidate_match_encodeSnappyBetterBlockAsm10B

no_short_found_encodeSnappyBetterBlockAsm10B:
	CMPL R9, SI
	JEQ  candidate_match_encodeSnappyBetterBlockAsm10B
	CMPL R10, SI
	JEQ  candidateS_match_encodeSnappyBetterBlockAsm10B
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBetterBlockAsm10B

candidateS_match_encodeSnappyBetterBlockAsm10B:
	SHRQ  $0x08, SI
	MOVQ  SI, R9
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x34, R9
	MOVL  24(SP)(R9*4), BX
	INCL  CX
	MOVL  CX, 24(SP)(R9*4)
	CMPL  (DX)(BX*1), SI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm10B
	DECL  CX
	MOVL  DI, BX

candidate_match_encodeSnappyBetterBlockAsm10B:
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm10B

match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
	CMPL CX, SI
	JBE  match_extend_back_end_encodeSnappyBetterBlockAsm10B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm10B
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm10B
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm10B

match_extend_back_end_encodeSnappyBetterBlockAsm10B:
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm10B:
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	XORL R11, R11

matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVQ (R8)(R11*1), R10
	MOVQ 8(R8)(R11*1), R12
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
	XORQ 8(R9)(R11*1), R12
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL -16(DI), DI
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B

matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP  matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B

matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE  matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
	JB   match_nolit_end_encodeSnappyBetterBlockAsm10B
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE  matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B:
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm10B
	LEAL 1(R11), R11

match_nolit_end_encodeSnappyBetterBlockAsm10B:
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	MOVL DI, 16(SP)
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB   one_byte_match_emit_encodeSnappyBetterBlockAsm10B
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
	JB   three_bytes_match_emit_encodeSnappyBetterBlockAsm10B

three_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm10B

two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_match_emit_encodeSnappyBetterBlockAsm10B
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm10B

one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm10B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B

memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL R11, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
	CMPL DI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeSnappyBetterBlockAsm10B
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
	MOVQ  $0x0000cf1bbcdcbf9b, BX
	MOVQ  $0x9e3779b1, DI
	LEAQ  1(SI), SI
	LEAQ  -2(CX), R8
	MOVQ  (DX)(SI*1), R9
	MOVQ  1(DX)(SI*1), R10
	MOVQ  (DX)(R8*1), R11
	MOVQ  1(DX)(R8*1), R12
	SHLQ  $0x10, R9
	IMULQ BX, R9
	SHRQ  $0x34, R9
	SHLQ  $0x20, R10
	IMULQ DI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x10, R11
	IMULQ BX, R11
	SHRQ  $0x34, R11
	SHLQ  $0x20, R12
	IMULQ DI, R12
	SHRQ  $0x36, R12
	LEAQ  1(SI), DI
	LEAQ  1(R8), R13
	MOVL  SI, 24(SP)(R9*4)
	MOVL  R8, 24(SP)(R11*4)
	MOVL  DI, 16408(SP)(R10*4)
	MOVL  R13, 16408(SP)(R12*4)
	LEAQ  1(R8)(SI*1), DI
	SHRQ  $0x01, DI
	ADDQ  $0x01, SI
	SUBQ  $0x01, R8

index_loop_encodeSnappyBetterBlockAsm10B:
	CMPQ  DI, R8
	JAE   search_loop_encodeSnappyBetterBlockAsm10B
	MOVQ  (DX)(SI*1), R9
	MOVQ  (DX)(DI*1), R10
	SHLQ  $0x10, R9
	IMULQ BX, R9
	SHRQ  $0x34, R9
	SHLQ  $0x10, R10
	IMULQ BX, R10
	SHRQ  $0x34, R10
	MOVL  SI, 24(SP)(R9*4)
	MOVL  DI, 24(SP)(R10*4)
	ADDQ  $0x02, SI
	ADDQ  $0x02, DI
	JMP   index_loop_encodeSnappyBetterBlockAsm10B

emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
	JB   three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B

one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal runs and copy operations and
// returns the number of bytes written to dst. Returns 0 as soon as the output
// cursor (plus pending literals) would reach the destination limit kept at
// 0(SP).
//
// Frame layout ($5144 bytes of locals, 56 bytes of arguments/result):
//   0(SP)     8 bytes  dst limit = dst_base + (src_len - 9 - (src_len >> 5))
//   8(SP)     4 bytes  search limit = src_len - 8
//   12(SP)    4 bytes  start of the not-yet-emitted literals ("nextEmit")
//   16(SP)    4 bytes  offset of the last match (written, never read here)
//   20(SP)    4 bytes  position to resume searching from after a miss
//   24(SP)    4096 B   first table: 1024 x uint32, indexed by a 10-bit hash
//   4120(SP)  1024 B   second table: 256 x uint32, indexed by an 8-bit hash
//
// Register roles in the main loop:
//   AX = dst write cursor, DX = src base, CX = current src position.
// The emit_remainder path reuses CX, DX and SI for other purposes.
TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56
	MOVQ dst_base+0(FP), AX
	// 0x28 = 40 iterations x 128 bytes = 5120 bytes: both tables at 24(SP).
	MOVQ $0x00000028, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm8B:
	// Clear 128 bytes of table storage per iteration.
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm8B
	// nextEmit = 0.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	// Search limit = src_len - 8.
	MOVL  BX, 8(SP)
	// dst limit = dst_base + src_len - 9 - (src_len >> 5).
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// Start searching at position 1 with a last-match offset of 0.
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm8B:
	// Next position on a miss: CX + 1 + ((CX - nextEmit) >> 4), so the step
	// grows with the length of the pending literal run.
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x04, BX
	LEAL  1(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_encodeSnappyBetterBlockAsm8B
	// SI = 8 bytes of src at the current position.
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  $0x9e3779b1, BX
	MOVQ  SI, R9
	MOVQ  SI, R10
	// R9 = 10-bit hash of the low 6 bytes (shift out 2 bytes, keep top 10 bits).
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x36, R9
	// R10 = 8-bit hash of the low 4 bytes (shift out 4 bytes, keep top 8 bits).
	SHLQ  $0x20, R10
	IMULQ BX, R10
	SHRQ  $0x38, R10
	// Fetch both candidates (BX from the first table, DI from the second),
	// then record the current position in both tables.
	MOVL  24(SP)(R9*4), BX
	MOVL  4120(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	MOVL  CX, 4120(SP)(R10*4)
	// Load 8 bytes at each candidate and try a full 8-byte match first.
	MOVQ  (DX)(BX*1), R9
	MOVQ  (DX)(DI*1), R10
	CMPQ  R9, SI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm8B
	CMPQ  R10, SI
	JNE   no_short_found_encodeSnappyBetterBlockAsm8B
	// The second-table candidate matched all 8 bytes: use it.
	MOVL  DI, BX
	JMP   candidate_match_encodeSnappyBetterBlockAsm8B

no_short_found_encodeSnappyBetterBlockAsm8B:
	// Neither candidate matched 8 bytes; fall back to 4-byte comparisons.
	CMPL R9, SI
	JEQ  candidate_match_encodeSnappyBetterBlockAsm8B
	CMPL R10, SI
	JEQ  candidateS_match_encodeSnappyBetterBlockAsm8B
	// No match at all: continue at the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBetterBlockAsm8B

candidateS_match_encodeSnappyBetterBlockAsm8B:
	// Only the second-table candidate matched 4 bytes. Before taking it, probe
	// the first table for position CX+1: hash the bytes starting one byte later.
	SHRQ  $0x08, SI
	MOVQ  SI, R9
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x36, R9
	MOVL  24(SP)(R9*4), BX
	INCL  CX
	MOVL  CX, 24(SP)(R9*4)
	// If that candidate matches 4 bytes at CX+1, use it (CX stays advanced).
	CMPL  (DX)(BX*1), SI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm8B
	// Otherwise go back to CX and use the second-table candidate in DI.
	DECL  CX
	MOVL  DI, BX

candidate_match_encodeSnappyBetterBlockAsm8B:
	// Match found: CX = current position, BX = candidate position.
	// SI = nextEmit bounds the backwards extension; skip it when BX is 0.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm8B

match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
	// Extend the match backwards while CX > nextEmit, BX > 0 and the bytes
	// just before both positions are equal.
	CMPL CX, SI
	JBE  match_extend_back_end_encodeSnappyBetterBlockAsm8B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm8B
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm8B
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm8B

match_extend_back_end_encodeSnappyBetterBlockAsm8B:
	// Bail out with 0 if AX + pending literal length + 3 reaches the dst limit.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_encodeSnappyBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm8B:
	// SI = match start. Advance both positions past the first 4 bytes and
	// measure how much further the match extends.
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	// DI = bytes remaining in src after CX; R8/R9 = pointers to compare.
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	// R11 accumulates the number of additional matching bytes.
	XORL R11, R11

matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Compare 16 bytes per iteration as two 8-byte XORs.
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B
	MOVQ (R8)(R11*1), R10
	MOVQ 8(R8)(R11*1), R12
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B
	XORQ 8(R9)(R11*1), R12
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B
	LEAL -16(DI), DI
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B

matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B:
	// Mismatch in the second 8 bytes: index of the lowest set bit / 8 is the
	// number of equal bytes, plus the 8 that already matched.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm8B

matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Fewer than 16 bytes left: try one 8-byte compare.
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	JMP  matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B

matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Mismatch within these 8 bytes: lowest set bit / 8 = equal byte count.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm8B

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Tail handling: 4 bytes, then 2, then 1.
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE  matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
	LEAL -4(DI), DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Exactly 1 byte left -> single-byte compare; 0 left -> done.
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
	JB   match_nolit_end_encodeSnappyBetterBlockAsm8B
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE  matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
	LEAL 2(R11), R11
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeSnappyBetterBlockAsm8B

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B:
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm8B
	LEAL 1(R11), R11

match_nolit_end_encodeSnappyBetterBlockAsm8B:
	// DI = match offset (current position - candidate position).
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	// The offset is stored at 16(SP); no repeat comparison is performed here.
	MOVL DI, 16(SP)
	// Emit pending literals src[nextEmit:SI], if any.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
	// R8 = literal length, R9 = literal source, BX = length - 1.
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	// Choose the literal header size from length - 1: < 60, < 256, otherwise.
	CMPL BX, $0x3c
	JB   one_byte_match_emit_encodeSnappyBetterBlockAsm8B
	CMPL BX, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
	// Same flags as the JB above, so this branch is never taken; execution
	// falls through to the three-byte header.
	JB   three_bytes_match_emit_encodeSnappyBetterBlockAsm8B

three_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
	// Header: 0xf4 followed by length - 1 as 16 bits.
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm8B

two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
	// Header: 0xf0 followed by length - 1 as 8 bits.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	// Lengths up to 64 use the short copy, longer ones the long copy.
	CMPL BX, $0x40
	JB   memmove_match_emit_encodeSnappyBetterBlockAsm8B
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm8B

one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
	// Header: (length - 1) << 2 in a single byte.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm8B:
	// BX = dst cursor after the literal bytes.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	// Dispatch on length; each case uses (possibly overlapping) fixed-size moves.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
	// Length <= 8: always moves a full 8 bytes; AX still advances only by R8.
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
	// First 8 and last 8 bytes (overlapping when length < 16).
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
	// First 16 and last 16 bytes.
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
	// First 32 and last 32 bytes.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
	MOVQ BX, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B

memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
	// BX = dst cursor after the literal bytes.
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// Keep the first and last 32 source bytes in X0-X3; they are written last
	// with unaligned stores to cover the head and tail.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R12 = length / 32. R13 = 64 - (AX & 31), so AX + R13 - 32 is 32-byte
	// aligned and the MOVOA stores below hit aligned addresses.
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	// NOTE(review): DECQ does not modify CF, so JA here (and JNA in the loop)
	// combine ZF from DECQ with CF from the preceding SUBQ/ADDQ. Confirm
	// against the generator before changing anything in this sequence.
	DECQ  R12
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
	// Copy 32 bytes: unaligned loads from R10, aligned stores to R14.
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
	// Copy 32-byte chunks ending at offset R13 while R8 >= R13.
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	// Write the saved head and tail.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  BX, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
	// CX = end of the match; R11 = total match length (extension + first 4
	// bytes); nextEmit = end of the match.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
	// While more than 64 bytes remain, emit a 3-byte copy (0xee + 16-bit
	// offset) and subtract 60 from the remaining length.
	CMPL R11, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
	// BX = length << 2. Lengths >= 12 take the 3-byte form.
	// NOTE(review): unlike the 10B variant there is no offset < 0x800 check
	// before the 2-byte form; presumably offsets are bounded by the input
	// size accepted by this variant - confirm against the caller.
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B
	// 2-byte form: byte 0 = ((length << 2) - 15) | ((offset >> 8) << 5),
	// byte 1 = low 8 bits of the offset.
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
	// 3-byte form: byte 0 = (length << 2) - 2, then the 16-bit offset.
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
	// Past the search limit -> flush the tail. dst limit reached -> return 0.
	CMPL CX, 8(SP)
	JAE  emit_remainder_encodeSnappyBetterBlockAsm8B
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
	// Index positions inside the match just emitted. SI = match start + 1,
	// R8 = match end - 2; SI and R8 go into the first table, SI+1 and R8+1
	// into the second.
	MOVQ  $0x0000cf1bbcdcbf9b, BX
	MOVQ  $0x9e3779b1, DI
	LEAQ  1(SI), SI
	LEAQ  -2(CX), R8
	MOVQ  (DX)(SI*1), R9
	MOVQ  1(DX)(SI*1), R10
	MOVQ  (DX)(R8*1), R11
	MOVQ  1(DX)(R8*1), R12
	// 10-bit hash (6 bytes) for the first table, 8-bit hash (4 bytes) for the
	// second, same as in the search loop.
	SHLQ  $0x10, R9
	IMULQ BX, R9
	SHRQ  $0x36, R9
	SHLQ  $0x20, R10
	IMULQ DI, R10
	SHRQ  $0x38, R10
	SHLQ  $0x10, R11
	IMULQ BX, R11
	SHRQ  $0x36, R11
	SHLQ  $0x20, R12
	IMULQ DI, R12
	SHRQ  $0x38, R12
	LEAQ  1(SI), DI
	LEAQ  1(R8), R13
	MOVL  SI, 24(SP)(R9*4)
	MOVL  R8, 24(SP)(R11*4)
	MOVL  DI, 4120(SP)(R10*4)
	MOVL  R13, 4120(SP)(R12*4)
	// DI = (SI + R8 + 1) / 2: midpoint between the two indexed positions.
	LEAQ  1(R8)(SI*1), DI
	SHRQ  $0x01, DI
	ADDQ  $0x01, SI
	SUBQ  $0x01, R8

index_loop_encodeSnappyBetterBlockAsm8B:
	// Fill the first table with every second position, walking SI from the
	// start and DI from the midpoint, until DI reaches R8.
	CMPQ  DI, R8
	JAE   search_loop_encodeSnappyBetterBlockAsm8B
	MOVQ  (DX)(SI*1), R9
	MOVQ  (DX)(DI*1), R10
	SHLQ  $0x10, R9
	IMULQ BX, R9
	SHRQ  $0x36, R9
	SHLQ  $0x10, R10
	IMULQ BX, R10
	SHRQ  $0x36, R10
	MOVL  SI, 24(SP)(R9*4)
	MOVL  DI, 24(SP)(R10*4)
	ADDQ  $0x02, SI
	ADDQ  $0x02, DI
	JMP   index_loop_encodeSnappyBetterBlockAsm8B

emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Return 0 if AX + (src_len - nextEmit) + 3 reaches the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_encodeSnappyBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
	// Emit src[nextEmit:src_len] as a final literal run, if non-empty.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
	// From here CX = literal source pointer, SI = literal length and
	// DX = length - 1 (the src base in DX is no longer needed).
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Choose the literal header size from length - 1: < 60, < 256, otherwise.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
	// Same flags as the JB above, so this branch is never taken; execution
	// falls through to the three-byte header.
	JB   three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Header: 0xf4 followed by length - 1 as 16 bits.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Header: 0xf0 followed by length - 1 as 8 bits.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBetterBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B

one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Header: (length - 1) << 2 in a single byte.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// DX = dst cursor after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Dispatch on exact length; unlike the in-match copy, no case here moves
	// more bytes than the length.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2:
	// First and last byte (the same byte when length is 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7:
	// First 4 and last 4 bytes (overlapping when length < 8).
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
	// First 8 and last 8 bytes.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
	// First 16 and last 16 bytes.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
	// First 32 and last 32 bytes.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// DX = dst cursor after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as the in-match long copy: save head and tail in X0-X3,
	// copy the middle in 32-byte chunks with aligned stores, then write the
	// head and tail with unaligned stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = length / 32; R8 = 64 - (AX & 31) so AX + R8 - 32 is 32-byte aligned.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// NOTE(review): DECQ does not modify CF, so JA here (and JNA in the loop)
	// combine ZF from DECQ with CF from the preceding SUBQ/ADDQ. Confirm
	// against the generator before changing anything in this sequence.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
	// Copy 32 bytes: unaligned loads from SI, aligned stores to R9.
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
	// Copy 32-byte chunks ending at offset R8 while BX >= R8.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	// Write the saved head and tail.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Result = bytes written = dst cursor - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func calcBlockSize(src []byte) int
// Requires: BMI, SSE2
TEXT ·calcBlockSize(SB), $32792-32
	XORQ AX, AX
	MOVQ $0x00000100, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_calcBlockSize:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_calcBlockSize
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+8(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+0(FP), DX

search_loop_calcBlockSize:
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x05, BX
	LEAL  4(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_calcBlockSize
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHRQ  $0x08, R10
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x33, R9
	SHLQ  $0x10, R10
	IMULQ R8, R10
	SHRQ  $0x33, R10
	MOVL  24(SP)(R9*4), BX
	MOVL  24(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	LEAL  1(CX), R9
	MOVL  R9, 24(SP)(R10*4)
	MOVQ  SI, R9
	SHRQ  $0x10, R9
	SHLQ  $0x10, R9
	IMULQ R8, R9
	SHRQ  $0x33, R9
	MOVL  CX, R8
	SUBL  16(SP), R8
	MOVL  1(DX)(R8*1), R10
	MOVQ  SI, R8
	SHRQ  $0x08, R8
	CMPL  R8, R10
	JNE   no_repeat_found_calcBlockSize
	LEAL  1(CX), SI
	MOVL  12(SP), BX
	MOVL  SI, DI
	SUBL  16(SP), DI
	JZ    repeat_extend_back_end_calcBlockSize

repeat_extend_back_loop_calcBlockSize:
	CMPL SI, BX
	JBE  repeat_extend_back_end_calcBlockSize
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE  repeat_extend_back_end_calcBlockSize
	LEAL -1(SI), SI
	DECL DI
	JNZ  repeat_extend_back_loop_calcBlockSize

repeat_extend_back_end_calcBlockSize:
	MOVL SI, BX
	SUBL 12(SP), BX
	LEAQ 5(AX)(BX*1), BX
	CMPQ BX, (SP)
	JB   repeat_dst_size_check_calcBlockSize
	MOVQ $0x00000000, ret+24(FP)
	RET

repeat_dst_size_check_calcBlockSize:
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_repeat_emit_calcBlockSize
	MOVL SI, DI
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R8
	SUBL BX, DI
	LEAL -1(DI), BX
	CMPL BX, $0x3c
	JB   one_byte_repeat_emit_calcBlockSize
	CMPL BX, $0x00000100
	JB   two_bytes_repeat_emit_calcBlockSize
	CMPL BX, $0x00010000
	JB   three_bytes_repeat_emit_calcBlockSize
	CMPL BX, $0x01000000
	JB   four_bytes_repeat_emit_calcBlockSize
	ADDQ $0x05, AX
	JMP  memmove_long_repeat_emit_calcBlockSize

four_bytes_repeat_emit_calcBlockSize:
	ADDQ $0x04, AX
	JMP  memmove_long_repeat_emit_calcBlockSize

three_bytes_repeat_emit_calcBlockSize:
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_calcBlockSize

two_bytes_repeat_emit_calcBlockSize:
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_repeat_emit_calcBlockSize
	JMP  memmove_long_repeat_emit_calcBlockSize

one_byte_repeat_emit_calcBlockSize:
	ADDQ $0x01, AX

memmove_repeat_emit_calcBlockSize:
	LEAQ (AX)(DI*1), AX
	JMP  emit_literal_done_repeat_emit_calcBlockSize

memmove_long_repeat_emit_calcBlockSize:
	LEAQ (AX)(DI*1), AX

emit_literal_done_repeat_emit_calcBlockSize:
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+8(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), BX

	// matchLen
	XORL R10, R10

matchlen_loopback_16_repeat_extend_calcBlockSize:
	CMPL DI, $0x10
	JB   matchlen_match8_repeat_extend_calcBlockSize
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_calcBlockSize
	XORQ 8(BX)(R10*1), R11
	JNZ  matchlen_bsf_16repeat_extend_calcBlockSize
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_repeat_extend_calcBlockSize

matchlen_bsf_16repeat_extend_calcBlockSize:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  repeat_extend_forward_end_calcBlockSize

matchlen_match8_repeat_extend_calcBlockSize:
	CMPL DI, $0x08
	JB   matchlen_match4_repeat_extend_calcBlockSize
	MOVQ (R8)(R10*1), R9
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_calcBlockSize
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_repeat_extend_calcBlockSize

matchlen_bsf_8_repeat_extend_calcBlockSize:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  repeat_extend_forward_end_calcBlockSize

matchlen_match4_repeat_extend_calcBlockSize:
	CMPL DI, $0x04
	JB   matchlen_match2_repeat_extend_calcBlockSize
	MOVL (R8)(R10*1), R9
	CMPL (BX)(R10*1), R9
	JNE  matchlen_match2_repeat_extend_calcBlockSize
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_calcBlockSize:
	CMPL DI, $0x01
	JE   matchlen_match1_repeat_extend_calcBlockSize
	JB   repeat_extend_forward_end_calcBlockSize
	MOVW (R8)(R10*1), R9
	CMPW (BX)(R10*1), R9
	JNE  matchlen_match1_repeat_extend_calcBlockSize
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   repeat_extend_forward_end_calcBlockSize

matchlen_match1_repeat_extend_calcBlockSize:
	MOVB (R8)(R10*1), R9
	CMPB (BX)(R10*1), R9
	JNE  repeat_extend_forward_end_calcBlockSize
	LEAL 1(R10), R10

repeat_extend_forward_end_calcBlockSize:
	ADDL R10, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI

	// emitCopy
	CMPL SI, $0x00010000
	JB   two_byte_offset_repeat_as_copy_calcBlockSize

four_bytes_loop_back_repeat_as_copy_calcBlockSize:
	CMPL BX, $0x40
	JBE  four_bytes_remain_repeat_as_copy_calcBlockSize
	LEAL -64(BX), BX
	ADDQ $0x05, AX
	CMPL BX, $0x04
	JB   four_bytes_remain_repeat_as_copy_calcBlockSize
	JMP  four_bytes_loop_back_repeat_as_copy_calcBlockSize

four_bytes_remain_repeat_as_copy_calcBlockSize:
	TESTL BX, BX
	JZ    repeat_end_emit_calcBlockSize
	XORL  BX, BX
	ADDQ  $0x05, AX
	JMP   repeat_end_emit_calcBlockSize

two_byte_offset_repeat_as_copy_calcBlockSize:
	CMPL BX, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_calcBlockSize
	LEAL -60(BX), BX
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_calcBlockSize

two_byte_offset_short_repeat_as_copy_calcBlockSize:
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE  emit_copy_three_repeat_as_copy_calcBlockSize
	CMPL SI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_calcBlockSize
	ADDQ $0x02, AX
	JMP  repeat_end_emit_calcBlockSize

emit_copy_three_repeat_as_copy_calcBlockSize:
	ADDQ $0x03, AX

repeat_end_emit_calcBlockSize:
	MOVL CX, 12(SP)
	JMP  search_loop_calcBlockSize

no_repeat_found_calcBlockSize:
	CMPL (DX)(BX*1), SI
	JEQ  candidate_match_calcBlockSize
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ  candidate2_match_calcBlockSize
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ  candidate3_match_calcBlockSize
	MOVL 20(SP), CX
	JMP  search_loop_calcBlockSize

candidate3_match_calcBlockSize:
	ADDL $0x02, CX
	JMP  candidate_match_calcBlockSize

candidate2_match_calcBlockSize:
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_calcBlockSize:
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_calcBlockSize

match_extend_back_loop_calcBlockSize:
	CMPL CX, SI
	JBE  match_extend_back_end_calcBlockSize
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_calcBlockSize
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_calcBlockSize
	JMP  match_extend_back_loop_calcBlockSize

match_extend_back_end_calcBlockSize:
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 5(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_calcBlockSize
	MOVQ $0x00000000, ret+24(FP)
	RET

match_dst_size_check_calcBlockSize:
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ  emit_literal_done_match_emit_calcBlockSize
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JB   one_byte_match_emit_calcBlockSize
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_calcBlockSize
	CMPL SI, $0x00010000
	JB   three_bytes_match_emit_calcBlockSize
	CMPL SI, $0x01000000
	JB   four_bytes_match_emit_calcBlockSize
	ADDQ $0x05, AX
	JMP  memmove_long_match_emit_calcBlockSize

four_bytes_match_emit_calcBlockSize:
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_calcBlockSize

three_bytes_match_emit_calcBlockSize:
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_calcBlockSize

two_bytes_match_emit_calcBlockSize:
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JB   memmove_match_emit_calcBlockSize
	JMP  memmove_long_match_emit_calcBlockSize

one_byte_match_emit_calcBlockSize:
	ADDQ $0x01, AX

memmove_match_emit_calcBlockSize:
	LEAQ (AX)(R8*1), AX
	JMP  emit_literal_done_match_emit_calcBlockSize

memmove_long_match_emit_calcBlockSize:
	LEAQ (AX)(R8*1), AX

emit_literal_done_match_emit_calcBlockSize:
match_nolit_loop_calcBlockSize:
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+8(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	XORL R9, R9

matchlen_loopback_16_match_nolit_calcBlockSize:
	CMPL SI, $0x10
	JB   matchlen_match8_match_nolit_calcBlockSize
	MOVQ (DI)(R9*1), R8
	MOVQ 8(DI)(R9*1), R10
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_calcBlockSize
	XORQ 8(BX)(R9*1), R10
	JNZ  matchlen_bsf_16match_nolit_calcBlockSize
	LEAL -16(SI), SI
	LEAL 16(R9), R9
	JMP  matchlen_loopback_16_match_nolit_calcBlockSize

matchlen_bsf_16match_nolit_calcBlockSize:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL 8(R9)(R10*1), R9
	JMP  match_nolit_end_calcBlockSize

matchlen_match8_match_nolit_calcBlockSize:
	CMPL SI, $0x08
	JB   matchlen_match4_match_nolit_calcBlockSize
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_calcBlockSize
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	JMP  matchlen_match4_match_nolit_calcBlockSize

matchlen_bsf_8_match_nolit_calcBlockSize:
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP  match_nolit_end_calcBlockSize

matchlen_match4_match_nolit_calcBlockSize:
	CMPL SI, $0x04
	JB   matchlen_match2_match_nolit_calcBlockSize
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE  matchlen_match2_match_nolit_calcBlockSize
	LEAL -4(SI), SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_calcBlockSize:
	CMPL SI, $0x01
	JE   matchlen_match1_match_nolit_calcBlockSize
	JB   match_nolit_end_calcBlockSize
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE  matchlen_match1_match_nolit_calcBlockSize
	LEAL 2(R9), R9
	SUBL $0x02, SI
	JZ   match_nolit_end_calcBlockSize

matchlen_match1_match_nolit_calcBlockSize:
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE  match_nolit_end_calcBlockSize
	LEAL 1(R9), R9

match_nolit_end_calcBlockSize:
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	CMPL BX, $0x00010000
	JB   two_byte_offset_match_nolit_calcBlockSize

four_bytes_loop_back_match_nolit_calcBlockSize:
	CMPL R9, $0x40
	JBE  four_bytes_remain_match_nolit_calcBlockSize
	LEAL -64(R9), R9
	ADDQ $0x05, AX
	CMPL R9, $0x04
	JB   four_bytes_remain_match_nolit_calcBlockSize
	JMP  four_bytes_loop_back_match_nolit_calcBlockSize

four_bytes_remain_match_nolit_calcBlockSize:
	TESTL R9, R9
	JZ    match_nolit_emitcopy_end_calcBlockSize
	XORL  BX, BX
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_calcBlockSize

two_byte_offset_match_nolit_calcBlockSize:
	CMPL R9, $0x40
	JBE  two_byte_offset_short_match_nolit_calcBlockSize
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_calcBlockSize

two_byte_offset_short_match_nolit_calcBlockSize:
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE  emit_copy_three_match_nolit_calcBlockSize
	CMPL BX, $0x00000800
	JAE  emit_copy_three_match_nolit_calcBlockSize
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_calcBlockSize

emit_copy_three_match_nolit_calcBlockSize:
	ADDQ $0x03, AX

match_nolit_emitcopy_end_calcBlockSize:
	CMPL CX, 8(SP)
	JAE  emit_remainder_calcBlockSize
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_calcBlockSize
	MOVQ $0x00000000, ret+24(FP)
	RET

match_nolit_dst_ok_calcBlockSize:
	MOVQ  $0x0000cf1bbcdcbf9b, R8
	MOVQ  SI, DI
	SHRQ  $0x10, SI
	MOVQ  SI, BX
	SHLQ  $0x10, DI
	IMULQ R8, DI
	SHRQ  $0x33, DI
	SHLQ  $0x10, BX
	IMULQ R8, BX
	SHRQ  $0x33, BX
	LEAL  -2(CX), R8
	LEAQ  24(SP)(BX*4), R9
	MOVL  (R9), BX
	MOVL  R8, 24(SP)(DI*4)
	MOVL  CX, (R9)
	CMPL  (DX)(BX*1), SI
	JEQ   match_nolit_loop_calcBlockSize
	INCL  CX
	JMP   search_loop_calcBlockSize

emit_remainder_calcBlockSize:
	MOVQ src_len+8(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_calcBlockSize
	MOVQ $0x00000000, ret+24(FP)
	RET

emit_remainder_ok_calcBlockSize:
	MOVQ src_len+8(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_calcBlockSize
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), CX
	CMPL CX, $0x3c
	JB   one_byte_emit_remainder_calcBlockSize
	CMPL CX, $0x00000100
	JB   two_bytes_emit_remainder_calcBlockSize
	CMPL CX, $0x00010000
	JB   three_bytes_emit_remainder_calcBlockSize
	CMPL CX, $0x01000000
	JB   four_bytes_emit_remainder_calcBlockSize
	ADDQ $0x05, AX
	JMP  memmove_long_emit_remainder_calcBlockSize

four_bytes_emit_remainder_calcBlockSize:
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_calcBlockSize

three_bytes_emit_remainder_calcBlockSize:
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_calcBlockSize

two_bytes_emit_remainder_calcBlockSize:
	ADDQ $0x02, AX
	CMPL CX, $0x40
	JB   memmove_emit_remainder_calcBlockSize
	JMP  memmove_long_emit_remainder_calcBlockSize

one_byte_emit_remainder_calcBlockSize:
	ADDQ $0x01, AX

memmove_emit_remainder_calcBlockSize:
	LEAQ (AX)(SI*1), AX
	JMP  emit_literal_done_emit_remainder_calcBlockSize

memmove_long_emit_remainder_calcBlockSize:
	LEAQ (AX)(SI*1), AX

emit_literal_done_emit_remainder_calcBlockSize:
	MOVQ AX, ret+24(FP)
	RET

// func calcBlockSizeSmall(src []byte) int
// Requires: BMI, SSE2
//
// Scans src for 4-byte matches using a hash table kept on the stack and adds
// up, in AX, the byte count of every literal run and copy it accounts for.
// Nothing is written to an output buffer. The total is returned, or 0 as soon
// as one of the size checks against the limit at (SP) fails.
//
// Frame layout:
//   (SP)    size limit: src_len - 9 - (src_len >> 5)
//   8(SP)   src_len - 8; searching stops once the position reaches it
//   12(SP)  start of the literal bytes that have not been counted yet
//   16(SP)  current match offset (starts at 1)
//   20(SP)  next search position, used when no candidate matches
//   24(SP)  hash table, 512 x 4-byte positions (2048 bytes)
//
// Long-lived registers: AX = running size, CX = current position, DX = src base.
TEXT ·calcBlockSizeSmall(SB), $2072-32
	XORQ AX, AX
	MOVQ $0x00000010, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear the hash table: 16 iterations of 128 bytes each.
zero_loop_calcBlockSizeSmall:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_calcBlockSizeSmall
	// No literals pending yet.
	MOVL  $0x00000000, 12(SP)
	// 8(SP) = src_len - 8 and (SP) = AX + src_len - 9 - (src_len >> 5); AX is 0 here.
	MOVQ  src_len+8(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), BX
	MOVL  BX, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// Start at position 1 with an offset of 1.
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+0(FP), DX

search_loop_calcBlockSizeSmall:
	// BX = CX + 4 + ((CX - 12(SP)) >> 4): the step grows with the number of
	// pending literal bytes. Leave the loop when BX reaches 8(SP).
	MOVL  CX, BX
	SUBL  12(SP), BX
	SHRL  $0x04, BX
	LEAL  4(CX)(BX*1), BX
	CMPL  BX, 8(SP)
	JAE   emit_remainder_calcBlockSizeSmall
	// SI = the 8 bytes at src[CX]; remember BX as the next search position.
	MOVQ  (DX)(CX*1), SI
	MOVL  BX, 20(SP)
	// R9 = hash of the 4 bytes at CX, R10 = hash of the 4 bytes at CX+1:
	// shift the low 32 bits to the top, multiply, keep the top 9 bits.
	MOVQ  $0x9e3779b1, R8
	MOVQ  SI, R9
	MOVQ  SI, R10
	SHRQ  $0x08, R10
	SHLQ  $0x20, R9
	IMULQ R8, R9
	SHRQ  $0x37, R9
	SHLQ  $0x20, R10
	IMULQ R8, R10
	SHRQ  $0x37, R10
	// Load both candidates (BX, DI), then record CX and CX+1 in their slots.
	MOVL  24(SP)(R9*4), BX
	MOVL  24(SP)(R10*4), DI
	MOVL  CX, 24(SP)(R9*4)
	LEAL  1(CX), R9
	MOVL  R9, 24(SP)(R10*4)
	// R9 = hash of the 4 bytes at CX+2, used by no_repeat_found below.
	MOVQ  SI, R9
	SHRQ  $0x10, R9
	SHLQ  $0x20, R9
	IMULQ R8, R9
	SHRQ  $0x37, R9
	// Compare the 4 bytes at CX+1 with the 4 bytes at CX+1-16(SP).
	MOVL  CX, R8
	SUBL  16(SP), R8
	MOVL  1(DX)(R8*1), R10
	MOVQ  SI, R8
	SHRQ  $0x08, R8
	CMPL  R8, R10
	JNE   no_repeat_found_calcBlockSizeSmall
	// Match at the current offset: SI = CX+1 (match start), BX = 12(SP),
	// DI = SI - 16(SP) (the earlier occurrence). Skip back-extension if DI is 0.
	LEAL  1(CX), SI
	MOVL  12(SP), BX
	MOVL  SI, DI
	SUBL  16(SP), DI
	JZ    repeat_extend_back_end_calcBlockSizeSmall

	// Walk both positions backwards while the preceding bytes agree, SI stays
	// above BX, and DI has not reached 0.
repeat_extend_back_loop_calcBlockSizeSmall:
	CMPL SI, BX
	JBE  repeat_extend_back_end_calcBlockSizeSmall
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE  repeat_extend_back_end_calcBlockSizeSmall
	LEAL -1(SI), SI
	DECL DI
	JNZ  repeat_extend_back_loop_calcBlockSizeSmall

repeat_extend_back_end_calcBlockSizeSmall:
	// Return 0 unless AX + pending literal length + 3 is below the limit.
	MOVL SI, BX
	SUBL 12(SP), BX
	LEAQ 3(AX)(BX*1), BX
	CMPQ BX, (SP)
	JB   repeat_dst_size_check_calcBlockSizeSmall
	MOVQ $0x00000000, ret+24(FP)
	RET

repeat_dst_size_check_calcBlockSizeSmall:
	// Count the pending literals src[12(SP):SI], if any. DI = their length,
	// BX = length - 1 selects the header size.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ  emit_literal_done_repeat_emit_calcBlockSizeSmall
	MOVL SI, DI
	MOVL SI, 12(SP)
	// R8 is overwritten at emit_literal_done before anything reads it.
	LEAQ (DX)(BX*1), R8
	SUBL BX, DI
	LEAL -1(DI), BX
	CMPL BX, $0x3c
	JB   one_byte_repeat_emit_calcBlockSizeSmall
	CMPL BX, $0x00000100
	JB   two_bytes_repeat_emit_calcBlockSizeSmall
	// Flags are unchanged since the JB above was not taken, so this JB is never
	// taken either; execution falls through to the same label.
	JB   three_bytes_repeat_emit_calcBlockSizeSmall

three_bytes_repeat_emit_calcBlockSizeSmall:
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_calcBlockSizeSmall

two_bytes_repeat_emit_calcBlockSizeSmall:
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB   memmove_repeat_emit_calcBlockSizeSmall
	JMP  memmove_long_repeat_emit_calcBlockSizeSmall

one_byte_repeat_emit_calcBlockSizeSmall:
	ADDQ $0x01, AX

	// Both "memmove" labels only add the literal length to AX; no bytes are copied.
memmove_repeat_emit_calcBlockSizeSmall:
	LEAQ (AX)(DI*1), AX
	JMP  emit_literal_done_repeat_emit_calcBlockSizeSmall

memmove_long_repeat_emit_calcBlockSizeSmall:
	LEAQ (AX)(DI*1), AX

emit_literal_done_repeat_emit_calcBlockSizeSmall:
	// Skip to CX+5 (one past the 4 bytes compared at CX+1) and extend forward:
	// R8 = &src[CX], BX = &src[CX - 16(SP)], DI = bytes left in src.
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+8(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R10 = number of equal bytes found so far.
	XORL R10, R10

	// Compare 16 bytes per iteration as two 8-byte XORs.
matchlen_loopback_16_repeat_extend_calcBlockSizeSmall:
	CMPL DI, $0x10
	JB   matchlen_match8_repeat_extend_calcBlockSizeSmall
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_calcBlockSizeSmall
	XORQ 8(BX)(R10*1), R11
	JNZ  matchlen_bsf_16repeat_extend_calcBlockSizeSmall
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_repeat_extend_calcBlockSizeSmall

	// Mismatch in the second 8 bytes: index of the lowest set bit, divided by 8,
	// is the number of equal bytes within that word.
matchlen_bsf_16repeat_extend_calcBlockSizeSmall:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  repeat_extend_forward_end_calcBlockSizeSmall

matchlen_match8_repeat_extend_calcBlockSizeSmall:
	CMPL DI, $0x08
	JB   matchlen_match4_repeat_extend_calcBlockSizeSmall
	MOVQ (R8)(R10*1), R9
	XORQ (BX)(R10*1), R9
	JNZ  matchlen_bsf_8_repeat_extend_calcBlockSizeSmall
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_repeat_extend_calcBlockSizeSmall

	// Mismatch in the first 8 bytes: same lowest-set-bit computation on R9.
matchlen_bsf_8_repeat_extend_calcBlockSizeSmall:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  repeat_extend_forward_end_calcBlockSizeSmall

matchlen_match4_repeat_extend_calcBlockSizeSmall:
	CMPL DI, $0x04
	JB   matchlen_match2_repeat_extend_calcBlockSizeSmall
	MOVL (R8)(R10*1), R9
	CMPL (BX)(R10*1), R9
	JNE  matchlen_match2_repeat_extend_calcBlockSizeSmall
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_calcBlockSizeSmall:
	// Exactly 1 byte left -> match1; 0 left -> done; otherwise try 2 bytes.
	CMPL DI, $0x01
	JE   matchlen_match1_repeat_extend_calcBlockSizeSmall
	JB   repeat_extend_forward_end_calcBlockSizeSmall
	MOVW (R8)(R10*1), R9
	CMPW (BX)(R10*1), R9
	JNE  matchlen_match1_repeat_extend_calcBlockSizeSmall
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   repeat_extend_forward_end_calcBlockSizeSmall

matchlen_match1_repeat_extend_calcBlockSizeSmall:
	MOVB (R8)(R10*1), R9
	CMPB (BX)(R10*1), R9
	JNE  repeat_extend_forward_end_calcBlockSizeSmall
	LEAL 1(R10), R10

repeat_extend_forward_end_calcBlockSizeSmall:
	// CX moves past the match; BX = CX - SI is the match length measured from
	// the back-extended start. The offset loaded into SI is not read again
	// before SI is overwritten below.
	ADDL R10, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI

	// emitCopy
	// While the length exceeds 64: count 3 bytes and take 60 off the length.
two_byte_offset_repeat_as_copy_calcBlockSizeSmall:
	CMPL BX, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall
	LEAL -60(BX), BX
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_calcBlockSizeSmall

two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall:
	// Remaining length >= 12 counts 3 bytes, otherwise 2. SI (length << 2) is
	// computed but not read again in this function.
	MOVL BX, SI
	SHLL $0x02, SI
	CMPL BX, $0x0c
	JAE  emit_copy_three_repeat_as_copy_calcBlockSizeSmall
	ADDQ $0x02, AX
	JMP  repeat_end_emit_calcBlockSizeSmall

emit_copy_three_repeat_as_copy_calcBlockSizeSmall:
	ADDQ $0x03, AX

repeat_end_emit_calcBlockSizeSmall:
	// Everything up to CX is now counted.
	MOVL CX, 12(SP)
	JMP  search_loop_calcBlockSizeSmall

no_repeat_found_calcBlockSizeSmall:
	// Try the three table candidates in turn: BX against the bytes at CX,
	// DI against the bytes at CX+1, then table[R9] against the bytes at CX+2.
	// SI is shifted right one byte before each later comparison.
	CMPL (DX)(BX*1), SI
	JEQ  candidate_match_calcBlockSizeSmall
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ  candidate2_match_calcBlockSizeSmall
	// Record CX+2 in the third slot (BX already holds its previous value).
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ  candidate3_match_calcBlockSizeSmall
	// Nothing matched: continue from the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_calcBlockSizeSmall

candidate3_match_calcBlockSizeSmall:
	// Match is at CX+2; BX already holds the candidate.
	ADDL $0x02, CX
	JMP  candidate_match_calcBlockSizeSmall

candidate2_match_calcBlockSizeSmall:
	// Match is at CX+1 with candidate DI; still record CX+2 in the third slot.
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_calcBlockSizeSmall:
	// CX = match position, BX = candidate position. Skip back-extension when
	// the candidate is at position 0.
	MOVL  12(SP), SI
	TESTL BX, BX
	JZ    match_extend_back_end_calcBlockSizeSmall

	// Walk both positions backwards while the preceding bytes agree, CX stays
	// above 12(SP), and BX has not reached 0.
match_extend_back_loop_calcBlockSizeSmall:
	CMPL CX, SI
	JBE  match_extend_back_end_calcBlockSizeSmall
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE  match_extend_back_end_calcBlockSizeSmall
	LEAL -1(CX), CX
	DECL BX
	JZ   match_extend_back_end_calcBlockSizeSmall
	JMP  match_extend_back_loop_calcBlockSizeSmall

match_extend_back_end_calcBlockSizeSmall:
	// Return 0 unless AX + pending literal length + 3 is below the limit.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB   match_dst_size_check_calcBlockSizeSmall
	MOVQ $0x00000000, ret+24(FP)
	RET

match_dst_size_check_calcBlockSizeSmall:
	// Count the pending literals src[12(SP):CX], if any. R8 = their length,
	// SI = length - 1 selects the header size.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ  emit_literal_done_match_emit_calcBlockSizeSmall
	MOVL SI, R8
	MOVL SI, 12(SP)
	// The address placed in SI is overwritten two instructions later without
	// being read.
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JB   one_byte_match_emit_calcBlockSizeSmall
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_calcBlockSizeSmall
	// Never taken: flags are unchanged since the JB above fell through.
	JB   three_bytes_match_emit_calcBlockSizeSmall

three_bytes_match_emit_calcBlockSizeSmall:
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_calcBlockSizeSmall

two_bytes_match_emit_calcBlockSizeSmall:
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JB   memmove_match_emit_calcBlockSizeSmall
	JMP  memmove_long_match_emit_calcBlockSizeSmall

one_byte_match_emit_calcBlockSizeSmall:
	ADDQ $0x01, AX

	// Both "memmove" labels only add the literal length to AX.
memmove_match_emit_calcBlockSizeSmall:
	LEAQ (AX)(R8*1), AX
	JMP  emit_literal_done_match_emit_calcBlockSizeSmall

memmove_long_match_emit_calcBlockSizeSmall:
	LEAQ (AX)(R8*1), AX

emit_literal_done_match_emit_calcBlockSizeSmall:
match_nolit_loop_calcBlockSizeSmall:
	// Store the new offset CX - BX, step both positions over the 4 bytes already
	// known to match, and set up the forward extension:
	// DI = &src[CX], BX = &src[candidate], SI = bytes left in src.
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+8(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 = number of equal bytes found so far.
	XORL R9, R9

	// Compare 16 bytes per iteration as two 8-byte XORs.
matchlen_loopback_16_match_nolit_calcBlockSizeSmall:
	CMPL SI, $0x10
	JB   matchlen_match8_match_nolit_calcBlockSizeSmall
	MOVQ (DI)(R9*1), R8
	MOVQ 8(DI)(R9*1), R10
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_calcBlockSizeSmall
	XORQ 8(BX)(R9*1), R10
	JNZ  matchlen_bsf_16match_nolit_calcBlockSizeSmall
	LEAL -16(SI), SI
	LEAL 16(R9), R9
	JMP  matchlen_loopback_16_match_nolit_calcBlockSizeSmall

	// Mismatch in the second 8 bytes: lowest set bit index / 8 = equal bytes.
matchlen_bsf_16match_nolit_calcBlockSizeSmall:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL 8(R9)(R10*1), R9
	JMP  match_nolit_end_calcBlockSizeSmall

matchlen_match8_match_nolit_calcBlockSizeSmall:
	CMPL SI, $0x08
	JB   matchlen_match4_match_nolit_calcBlockSizeSmall
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	JNZ  matchlen_bsf_8_match_nolit_calcBlockSizeSmall
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	JMP  matchlen_match4_match_nolit_calcBlockSizeSmall

	// Mismatch in the first 8 bytes: same computation on R8.
matchlen_bsf_8_match_nolit_calcBlockSizeSmall:
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP  match_nolit_end_calcBlockSizeSmall

matchlen_match4_match_nolit_calcBlockSizeSmall:
	CMPL SI, $0x04
	JB   matchlen_match2_match_nolit_calcBlockSizeSmall
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE  matchlen_match2_match_nolit_calcBlockSizeSmall
	LEAL -4(SI), SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_calcBlockSizeSmall:
	// Exactly 1 byte left -> match1; 0 left -> done; otherwise try 2 bytes.
	CMPL SI, $0x01
	JE   matchlen_match1_match_nolit_calcBlockSizeSmall
	JB   match_nolit_end_calcBlockSizeSmall
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE  matchlen_match1_match_nolit_calcBlockSizeSmall
	LEAL 2(R9), R9
	SUBL $0x02, SI
	JZ   match_nolit_end_calcBlockSizeSmall

matchlen_match1_match_nolit_calcBlockSizeSmall:
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE  match_nolit_end_calcBlockSizeSmall
	LEAL 1(R9), R9

match_nolit_end_calcBlockSizeSmall:
	// CX moves past the match; R9 + 4 is the full match length. Everything up
	// to CX is now counted.
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	// While the length exceeds 64: count 3 bytes and take 60 off the length.
two_byte_offset_match_nolit_calcBlockSizeSmall:
	CMPL R9, $0x40
	JBE  two_byte_offset_short_match_nolit_calcBlockSizeSmall
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_calcBlockSizeSmall

two_byte_offset_short_match_nolit_calcBlockSizeSmall:
	// Remaining length >= 12 counts 3 bytes, otherwise 2. BX (length << 2) is
	// overwritten later without being read.
	MOVL R9, BX
	SHLL $0x02, BX
	CMPL R9, $0x0c
	JAE  emit_copy_three_match_nolit_calcBlockSizeSmall
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_calcBlockSizeSmall

emit_copy_three_match_nolit_calcBlockSizeSmall:
	ADDQ $0x03, AX

match_nolit_emitcopy_end_calcBlockSizeSmall:
	// Stop searching once CX reaches 8(SP); otherwise load the 8 bytes at CX-2
	// and return 0 if AX has reached the limit.
	CMPL CX, 8(SP)
	JAE  emit_remainder_calcBlockSizeSmall
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB   match_nolit_dst_ok_calcBlockSizeSmall
	MOVQ $0x00000000, ret+24(FP)
	RET

match_nolit_dst_ok_calcBlockSizeSmall:
	// DI = hash of the 4 bytes at CX-2, BX = hash of the 4 bytes at CX
	// (SI is shifted right 16 bits so that it holds the bytes at CX).
	MOVQ  $0x9e3779b1, R8
	MOVQ  SI, DI
	SHRQ  $0x10, SI
	MOVQ  SI, BX
	SHLQ  $0x20, DI
	IMULQ R8, DI
	SHRQ  $0x37, DI
	SHLQ  $0x20, BX
	IMULQ R8, BX
	SHRQ  $0x37, BX
	// Fetch the candidate for CX into BX, then record CX-2 and CX in the table.
	LEAL  -2(CX), R8
	LEAQ  24(SP)(BX*4), R9
	MOVL  (R9), BX
	MOVL  R8, 24(SP)(DI*4)
	MOVL  CX, (R9)
	// If the candidate's 4 bytes equal those at CX, handle another match with
	// no literals in between; otherwise resume searching at CX+1.
	CMPL  (DX)(BX*1), SI
	JEQ   match_nolit_loop_calcBlockSizeSmall
	INCL  CX
	JMP   search_loop_calcBlockSizeSmall

emit_remainder_calcBlockSizeSmall:
	// Return 0 unless AX + (src_len - 12(SP)) + 3 is below the limit.
	MOVQ src_len+8(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB   emit_remainder_ok_calcBlockSizeSmall
	MOVQ $0x00000000, ret+24(FP)
	RET

emit_remainder_ok_calcBlockSizeSmall:
	// Count the trailing literals src[12(SP):src_len], if any. SI = their
	// length, CX = length - 1 selects the header size.
	MOVQ src_len+8(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_calcBlockSizeSmall
	MOVL CX, SI
	MOVL CX, 12(SP)
	// The address placed in CX is overwritten two instructions later without
	// being read.
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), CX
	CMPL CX, $0x3c
	JB   one_byte_emit_remainder_calcBlockSizeSmall
	CMPL CX, $0x00000100
	JB   two_bytes_emit_remainder_calcBlockSizeSmall
	// Never taken: flags are unchanged since the JB above fell through.
	JB   three_bytes_emit_remainder_calcBlockSizeSmall

three_bytes_emit_remainder_calcBlockSizeSmall:
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_calcBlockSizeSmall

two_bytes_emit_remainder_calcBlockSizeSmall:
	ADDQ $0x02, AX
	CMPL CX, $0x40
	JB   memmove_emit_remainder_calcBlockSizeSmall
	JMP  memmove_long_emit_remainder_calcBlockSizeSmall

one_byte_emit_remainder_calcBlockSizeSmall:
	ADDQ $0x01, AX

	// Both "memmove" labels only add the literal length to AX.
memmove_emit_remainder_calcBlockSizeSmall:
	LEAQ (AX)(SI*1), AX
	JMP  emit_literal_done_emit_remainder_calcBlockSizeSmall

memmove_long_emit_remainder_calcBlockSizeSmall:
	LEAQ (AX)(SI*1), AX

emit_literal_done_emit_remainder_calcBlockSizeSmall:
	// Return the accumulated size.
	MOVQ AX, ret+24(FP)
	RET

// func emitLiteral(dst []byte, lit []byte) int
// Requires: SSE2
//
// Writes a length header for lit at dst, followed by the bytes of lit, and
// returns the total number of bytes written (header + len(lit)). Returns 0 and
// writes nothing when lit is empty. No bounds check is made against dst's
// length here.
//
// Registers: AX = write pointer, CX = lit pointer, DX = len(lit),
// BX = return value, SI = len(lit) - 1 (the value stored in the header).
//
// Header forms, chosen by n = len(lit) - 1:
//   n < 60        1 byte : n << 2
//   n < 1<<8      2 bytes: 0xf0, n
//   n < 1<<16     3 bytes: 0xf4, n as 2 bytes
//   n < 1<<24     4 bytes: 0xf8, n as 3 bytes
//   otherwise     5 bytes: 0xfc, n as 4 bytes
TEXT ·emitLiteral(SB), NOSPLIT, $0-56
	MOVQ  lit_len+32(FP), DX
	MOVQ  dst_base+0(FP), AX
	MOVQ  lit_base+24(FP), CX
	TESTQ DX, DX
	JZ    emit_literal_end_standalone_skip
	// BX starts as len(lit); each header path adds its own size to it.
	MOVL  DX, BX
	LEAL  -1(DX), SI
	CMPL  SI, $0x3c
	JB    one_byte_standalone
	CMPL  SI, $0x00000100
	JB    two_bytes_standalone
	CMPL  SI, $0x00010000
	JB    three_bytes_standalone
	CMPL  SI, $0x01000000
	JB    four_bytes_standalone
	MOVB  $0xfc, (AX)
	MOVL  SI, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   memmove_long_standalone

four_bytes_standalone:
	// Low 16 bits of n via MOVW, bits 16..23 via DI.
	MOVL SI, DI
	SHRL $0x10, DI
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB DI, 3(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  memmove_long_standalone

three_bytes_standalone:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  memmove_long_standalone

two_bytes_standalone:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	// n < 64 (at most 64 bytes) can use the short copy.
	CMPL SI, $0x40
	JB   memmove_standalone
	JMP  memmove_long_standalone

one_byte_standalone:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, BX
	ADDQ $0x01, AX

memmove_standalone:
	// genMemMoveShort
	// Copy 1..64 bytes. Each case loads the head and the tail of lit (the two
	// may overlap) before storing either, so every length in the case's range
	// is covered without a loop.
	CMPQ DX, $0x03
	JB   emit_lit_memmove_standalone_memmove_move_1or2
	JE   emit_lit_memmove_standalone_memmove_move_3
	CMPQ DX, $0x08
	JB   emit_lit_memmove_standalone_memmove_move_4through7
	CMPQ DX, $0x10
	JBE  emit_lit_memmove_standalone_memmove_move_8through16
	CMPQ DX, $0x20
	JBE  emit_lit_memmove_standalone_memmove_move_17through32
	JMP  emit_lit_memmove_standalone_memmove_move_33through64

emit_lit_memmove_standalone_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(DX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(DX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(DX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(DX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

memmove_long_standalone:
	// genMemMoveLong
	// X0..X3 keep the first and last 32 bytes of lit; they are stored last,
	// with unaligned stores. The loops copy 32 bytes at a time using aligned
	// stores: R8 = 64 - (AX & 31), so AX + R8 - 32 is 32-byte aligned.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVQ  DX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// NOTE(review): DECQ leaves CF untouched, so JA here (and JNA in the loop
	// below) also depends on the carry left by the preceding SUBQ/ADDQ.
	DECQ  DI
	JA    emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_standalonelarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_standalonelarge_big_loop_back

emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
	// Copy 32-byte chunks ending at offset R8 while R8 <= len(lit).
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  DX, R8
	JAE   emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	// Finish with the saved head and tail.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

emit_literal_end_standalone_skip:
	// Empty lit: nothing written, return 0.
	XORQ BX, BX

emit_literal_end_standalone:
	MOVQ BX, ret+48(FP)
	RET

// func emitRepeat(dst []byte, offset int, length int) int
//
// Writes a repeat code for the given length at dst and returns the number of
// bytes written. offset is only used by the two-byte form that embeds it.
// No bounds check is made against dst's length here.
//
// Registers: AX = write pointer, BX = bytes written, CX = offset,
// DX = length (reduced by 4 at the top of each pass), SI = length before that
// reduction.
TEXT ·emitRepeat(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitRepeat
emit_repeat_again_standalone:
	// length <= 8: two-byte form. length < 12 and offset < 2048: two-byte form
	// carrying the offset. Everything else is selected on DX = length - 4.
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE  repeat_two_standalone
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_standalone
	CMPL CX, $0x00000800
	JB   repeat_two_offset_standalone

cant_repeat_two_offset_standalone:
	CMPL DX, $0x00000104
	JB   repeat_three_standalone
	CMPL DX, $0x00010100
	JB   repeat_four_standalone
	CMPL DX, $0x0100ffff
	JB   repeat_five_standalone
	// Too long for one code: write the five bytes 1d 00 fb ff ff, take
	// 16842747 off DX and go round again with what is left.
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone

repeat_five_standalone:
	// 5 bytes: word 0x001d, then DX - 65536 as 3 bytes (low word, then bits 16..23).
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_repeat_end

repeat_four_standalone:
	// 4 bytes: word 0x0019, then DX - 256 as 2 bytes.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_repeat_end

repeat_three_standalone:
	// 3 bytes: word 0x0015, then DX - 4 as 1 byte.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_repeat_end

repeat_two_standalone:
	// 2 bytes: the word (DX << 2) | 1.
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_repeat_end

repeat_two_offset_standalone:
	// 2 bytes: byte 0 = (DX*4 + 1) | ((offset >> 8) << 5), byte 1 = low 8 bits
	// of offset.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX

gen_emit_repeat_end:
	MOVQ BX, ret+40(FP)
	RET

// func emitCopy(dst []byte, offset int, length int) int
//
// Writes a copy code for (offset, length) at dst, followed by a repeat code
// for whatever length a single copy code does not cover, and returns the
// number of bytes written. No bounds check is made against dst's length here.
//
// Registers: AX = write pointer, BX = bytes written, CX = offset,
// DX = remaining length.
//
// The function contains three inlined copies of the repeat emitter; they
// differ only in label suffix (_emit_copy, _emit_copy_short_2b,
// _emit_copy_short) and all finish at gen_emit_copy_end.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	// offset >= 65536 uses the 5-byte form with a 4-byte offset.
	CMPL CX, $0x00010000
	JB   two_byte_offset_standalone
	CMPL DX, $0x40
	JBE  four_bytes_remain_standalone
	// length > 64: write byte 0xff plus the 4-byte offset and take 64 off the
	// length. Fewer than 4 bytes left goes to another 5-byte copy, otherwise
	// the rest is emitted as a repeat.
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	CMPL DX, $0x04
	JB   four_bytes_remain_standalone

	// emitRepeat
emit_repeat_again_standalone_emit_copy:
	// SI = length, DX = length - 4; pick the repeat form.
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE  repeat_two_standalone_emit_copy
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_standalone_emit_copy
	CMPL CX, $0x00000800
	JB   repeat_two_offset_standalone_emit_copy

cant_repeat_two_offset_standalone_emit_copy:
	CMPL DX, $0x00000104
	JB   repeat_three_standalone_emit_copy
	CMPL DX, $0x00010100
	JB   repeat_four_standalone_emit_copy
	CMPL DX, $0x0100ffff
	JB   repeat_five_standalone_emit_copy
	// Too long for one code: write 1d 00 fb ff ff, take 16842747 off DX, loop.
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone_emit_copy

repeat_five_standalone_emit_copy:
	// 5 bytes: word 0x001d, then DX - 65536 as 3 bytes.
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_copy_end

repeat_four_standalone_emit_copy:
	// 4 bytes: word 0x0019, then DX - 256 as 2 bytes.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_copy_end

repeat_three_standalone_emit_copy:
	// 3 bytes: word 0x0015, then DX - 4 as 1 byte.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_copy_end

repeat_two_standalone_emit_copy:
	// 2 bytes: the word (DX << 2) | 1.
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

repeat_two_offset_standalone_emit_copy:
	// 2 bytes: byte 0 = (DX*4 + 1) | ((offset >> 8) << 5), byte 1 = offset low byte.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

four_bytes_remain_standalone:
	// Nothing left: done. Otherwise 5 bytes: byte 0 = DX*4 - 1, then the
	// 4-byte offset.
	TESTL DX, DX
	JZ    gen_emit_copy_end
	XORL  SI, SI
	LEAL  -1(SI)(DX*4), DX
	MOVB  DL, (AX)
	MOVL  CX, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   gen_emit_copy_end

two_byte_offset_standalone:
	// offset < 65536. length <= 64 is handled by the single-code path at
	// two_byte_offset_short_standalone; offset >= 2048 by long_offset_short.
	CMPL DX, $0x40
	JBE  two_byte_offset_short_standalone
	CMPL CX, $0x00000800
	JAE  long_offset_short_standalone
	// length > 64 and offset < 2048: write a 2-byte copy whose first byte is
	// 17 | ((offset >> 8) << 5) and whose second byte is the offset's low byte,
	// then take 8 off the length.
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB CL, 1(AX)
	MOVL CX, DI
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	SUBL $0x08, DX

	// emitRepeat
	// Enter the repeat emitter past its two-byte checks: DX is reduced by 4
	// here and control goes straight to the 3/4/5-byte selection.
	LEAL -4(DX), DX
	JMP  cant_repeat_two_offset_standalone_emit_copy_short_2b

emit_repeat_again_standalone_emit_copy_short_2b:
	// Reached only from the "too long" case below.
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE  repeat_two_standalone_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_standalone_emit_copy_short_2b
	CMPL CX, $0x00000800
	JB   repeat_two_offset_standalone_emit_copy_short_2b

cant_repeat_two_offset_standalone_emit_copy_short_2b:
	CMPL DX, $0x00000104
	JB   repeat_three_standalone_emit_copy_short_2b
	CMPL DX, $0x00010100
	JB   repeat_four_standalone_emit_copy_short_2b
	CMPL DX, $0x0100ffff
	JB   repeat_five_standalone_emit_copy_short_2b
	// Too long for one code: write 1d 00 fb ff ff, take 16842747 off DX, loop.
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone_emit_copy_short_2b

repeat_five_standalone_emit_copy_short_2b:
	// 5 bytes: word 0x001d, then DX - 65536 as 3 bytes.
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_copy_end

repeat_four_standalone_emit_copy_short_2b:
	// 4 bytes: word 0x0019, then DX - 256 as 2 bytes.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_copy_end

repeat_three_standalone_emit_copy_short_2b:
	// 3 bytes: word 0x0015, then DX - 4 as 1 byte.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_copy_end

repeat_two_standalone_emit_copy_short_2b:
	// 2 bytes: the word (DX << 2) | 1.
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short_2b:
	// 2 bytes: byte 0 = (DX*4 + 1) | ((offset >> 8) << 5), byte 1 = offset low byte.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

long_offset_short_standalone:
	// length > 64 and 2048 <= offset < 65536: write a 3-byte copy (byte 0xee
	// plus the 2-byte offset), take 60 off the length and emit the rest as a
	// repeat.
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX

	// emitRepeat
emit_repeat_again_standalone_emit_copy_short:
	// SI = length, DX = length - 4; pick the repeat form.
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE  repeat_two_standalone_emit_copy_short
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_standalone_emit_copy_short
	CMPL CX, $0x00000800
	JB   repeat_two_offset_standalone_emit_copy_short

cant_repeat_two_offset_standalone_emit_copy_short:
	CMPL DX, $0x00000104
	JB   repeat_three_standalone_emit_copy_short
	CMPL DX, $0x00010100
	JB   repeat_four_standalone_emit_copy_short
	CMPL DX, $0x0100ffff
	JB   repeat_five_standalone_emit_copy_short
	// Too long for one code: write 1d 00 fb ff ff, take 16842747 off DX, loop.
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone_emit_copy_short

repeat_five_standalone_emit_copy_short:
	// 5 bytes: word 0x001d, then DX - 65536 as 3 bytes.
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_copy_end

repeat_four_standalone_emit_copy_short:
	// 4 bytes: word 0x0019, then DX - 256 as 2 bytes.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_copy_end

repeat_three_standalone_emit_copy_short:
	// 3 bytes: word 0x0015, then DX - 4 as 1 byte.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_copy_end

repeat_two_standalone_emit_copy_short:
	// 2 bytes: the word (DX << 2) | 1.
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short:
	// 2 bytes: byte 0 = (DX*4 + 1) | ((offset >> 8) << 5), byte 1 = offset low byte.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

two_byte_offset_short_standalone:
	// length <= 64 and offset < 65536: a single code. SI = length << 2.
	// length >= 12 or offset >= 2048 needs the 3-byte form.
	MOVL DX, SI
	SHLL $0x02, SI
	CMPL DX, $0x0c
	JAE  emit_copy_three_standalone
	CMPL CX, $0x00000800
	JAE  emit_copy_three_standalone
	// 2 bytes: byte 0 = (length*4 - 15) | ((offset >> 8) << 5),
	// byte 1 = offset low byte.
	LEAL -15(SI), SI
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

emit_copy_three_standalone:
	// 3 bytes: byte 0 = length*4 - 2, then the 2-byte offset.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end:
	MOVQ BX, ret+40(FP)
	RET

// func emitCopyNoRepeat(dst []byte, offset int, length int) int
//
// emitCopyNoRepeat writes one or more copy ops describing (offset, length)
// at the start of dst and returns the number of bytes written. Only the
// three copy encodings below are produced; there is no repeat path.
//
// Encodings written (the "tag" is the first byte of each op):
//   2 bytes: tag = (length*4 - 15) | (offset>>8)<<5, then the low offset byte
//            (used when length < 12 and offset < 2048)
//   3 bytes: tag = length*4 - 2, then a 16-bit offset
//   5 bytes: tag = length*4 - 1, then a 32-bit offset
//
// dst_len is never read, so no bounds checking happens here; presumably the
// caller guarantees enough room - verify against callers.
// All comparisons are 32-bit (CMPL), so only the low 32 bits of offset and
// length are examined.
//
// Register use:
//   AX = write pointer into dst
//   BX = bytes written so far (the return value)
//   CX = offset
//   DX = remaining length
//   SI = scratch used to build the tag byte
TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	// Offsets below 65536 fit in 16 bits and use the 2- and 3-byte forms.
	CMPL CX, $0x00010000
	JB   two_byte_offset_standalone_snappy

four_bytes_loop_back_standalone_snappy:
	// While more than 64 bytes remain, emit a 5-byte op covering 64 bytes.
	// 0xff is what the formula length*4 - 1 yields for length 64.
	CMPL DX, $0x40
	JBE  four_bytes_remain_standalone_snappy
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	// Fewer than 4 bytes left: skip the loop test and emit the final op.
	CMPL DX, $0x04
	JB   four_bytes_remain_standalone_snappy
	JMP  four_bytes_loop_back_standalone_snappy

four_bytes_remain_standalone_snappy:
	// Final 5-byte op for whatever is left; nothing is written for zero.
	TESTL DX, DX
	JZ    gen_emit_copy_end_snappy
	XORL  SI, SI
	// DX = length*4 - 1 (SI is zero and only serves as the LEA base).
	LEAL  -1(SI)(DX*4), DX
	MOVB  DL, (AX)
	MOVL  CX, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   gen_emit_copy_end_snappy

two_byte_offset_standalone_snappy:
	// While more than 64 bytes remain, emit a 3-byte op covering 60 bytes.
	// 0xee is what the formula length*4 - 2 yields for length 60.
	CMPL DX, $0x40
	JBE  two_byte_offset_short_standalone_snappy
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX
	JMP  two_byte_offset_standalone_snappy

two_byte_offset_short_standalone_snappy:
	// length <= 64 here. SI = length*4 is the base for both tag formulas.
	MOVL DX, SI
	SHLL $0x02, SI
	// The 2-byte form is only used when length < 12 and offset < 2048.
	CMPL DX, $0x0c
	JAE  emit_copy_three_standalone_snappy
	CMPL CX, $0x00000800
	JAE  emit_copy_three_standalone_snappy
	// 2-byte op: the low offset byte goes to dst[1]; offset bits 8 and up are
	// shifted into bits 5+ of the tag, which is stored as a single byte.
	LEAL -15(SI), SI
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end_snappy

emit_copy_three_standalone_snappy:
	// 3-byte op: tag = length*4 - 2, followed by the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end_snappy:
	// Return the number of bytes written.
	MOVQ BX, ret+40(FP)
	RET

// func matchLen(a []byte, b []byte) int
// Requires: BMI
//
// matchLen returns the number of leading bytes for which a and b are equal,
// examining at most len(a) bytes.
//
// b_len is never read: only a_len bounds the comparison, so b is presumably
// required to be at least as long as a - verify against callers.
// The remaining-length tests are 32-bit (CMPL on DX).
//
// Register use:
//   AX = base of a
//   CX = base of b
//   DX = bytes of a not yet compared
//   SI = number of matching bytes found so far (also the index into a and b)
//   BX, DI = scratch for loaded / XORed data
TEXT ·matchLen(SB), NOSPLIT, $0-56
	MOVQ a_base+0(FP), AX
	MOVQ b_base+24(FP), CX
	MOVQ a_len+8(FP), DX

	// matchLen
	XORL SI, SI

matchlen_loopback_16_standalone:
	// Compare 16 bytes per iteration as two 8-byte words. XOR leaves a
	// non-zero result exactly when the words differ.
	CMPL DX, $0x10
	JB   matchlen_match8_standalone
	MOVQ (AX)(SI*1), BX
	MOVQ 8(AX)(SI*1), DI
	XORQ (CX)(SI*1), BX
	JNZ  matchlen_bsf_8_standalone
	XORQ 8(CX)(SI*1), DI
	JNZ  matchlen_bsf_16standalone
	LEAL -16(DX), DX
	LEAL 16(SI), SI
	JMP  matchlen_loopback_16_standalone

matchlen_bsf_16standalone:
	// Mismatch in the second word (DI is non-zero here). The index of the
	// lowest set bit divided by 8 is the number of equal bytes in that word;
	// the first word contributed 8 more.
#ifdef GOAMD64_v3
	TZCNTQ DI, DI

#else
	BSFQ DI, DI

#endif
	SARQ $0x03, DI
	LEAL 8(SI)(DI*1), SI
	JMP  gen_match_len_end

matchlen_match8_standalone:
	// Fewer than 16 bytes remain: try a single 8-byte compare.
	CMPL DX, $0x08
	JB   matchlen_match4_standalone
	MOVQ (AX)(SI*1), BX
	XORQ (CX)(SI*1), BX
	JNZ  matchlen_bsf_8_standalone
	LEAL -8(DX), DX
	LEAL 8(SI), SI
	JMP  matchlen_match4_standalone

matchlen_bsf_8_standalone:
	// Mismatch inside the word held in BX (non-zero here): add the number of
	// equal low-order bytes to the running count.
#ifdef GOAMD64_v3
	TZCNTQ BX, BX

#else
	BSFQ BX, BX

#endif
	SARQ $0x03, BX
	LEAL (SI)(BX*1), SI
	JMP  gen_match_len_end

matchlen_match4_standalone:
	// Fewer than 8 bytes remain: try a 4-byte compare. On mismatch, fall to
	// the 2-byte step without advancing, so it can find the exact position.
	CMPL DX, $0x04
	JB   matchlen_match2_standalone
	MOVL (AX)(SI*1), BX
	CMPL (CX)(SI*1), BX
	JNE  matchlen_match2_standalone
	LEAL -4(DX), DX
	LEAL 4(SI), SI

matchlen_match2_standalone:
	// Exactly 1 byte left goes to the single-byte compare; 0 left finishes.
	// Both branches use the flags of the same CMPL.
	CMPL DX, $0x01
	JE   matchlen_match1_standalone
	JB   gen_match_len_end
	MOVW (AX)(SI*1), BX
	CMPW (CX)(SI*1), BX
	JNE  matchlen_match1_standalone
	LEAL 2(SI), SI
	// SUBL (not LEAL) is used here because the JZ below needs its flags.
	SUBL $0x02, DX
	JZ   gen_match_len_end

matchlen_match1_standalone:
	// Final single-byte compare.
	MOVB (AX)(SI*1), BL
	CMPB (CX)(SI*1), BL
	JNE  gen_match_len_end
	LEAL 1(SI), SI

gen_match_len_end:
	MOVQ SI, ret+48(FP)
	RET

// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// cvtLZ4BlockAsm parses src as a series of sequences (token byte, optional
// extra literal-length bytes, literal bytes, 16-bit offset, optional extra
// match-length bytes) and re-emits them into dst as literal, copy and repeat
// ops. Judging by its name and labels this is the LZ4 block to S2 conversion.
//
// Results:
//   uncompressed = sum of all literal and match lengths seen, or
//                  -1 if src is malformed / truncated,
//                  -2 if dst is too small
//   dstUsed      = bytes written to dst; only stored on success
//
// Register use:
//   AX  = write pointer into dst
//   CX  = dst_base + dst_len - 10 (writes stop once AX reaches this limit)
//   DX  = read pointer into src
//   BX  = src_base + src_len (end of src)
//   SI  = running uncompressed size
//   DI  = offset of the previous match, 0 initially (selects repeat ops)
//   R9  = literal length, later the match offset
//   R10 = match length (low token nibble + 4, plus extension bytes)
//   R8, R11-R15, X0-X5 = scratch
TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -10(AX)(CX*1), CX
	XORQ DI, DI

lz4_s2_loop:
	// Start of one sequence: check both buffers, then split the token byte
	// into literal length (high nibble, R9) and match length (low nibble, R10).
	CMPQ    DX, BX
	JAE     lz4_s2_corrupt
	CMPQ    AX, CX
	JAE     lz4_s2_dstfull
	MOVBQZX (DX), R8
	MOVQ    R8, R9
	MOVQ    R8, R10
	SHRQ    $0x04, R9
	ANDQ    $0x0f, R10
	// A high nibble of 15 (token >= 0xf0) means extension bytes follow.
	CMPQ    R8, $0xf0
	JB      lz4_s2_ll_end

lz4_s2_ll_loop:
	// Add extension bytes to the literal length; a byte of 0xff means
	// another one follows.
	INCQ    DX
	CMPQ    DX, BX
	JAE     lz4_s2_corrupt
	MOVBQZX (DX), R8
	ADDQ    R8, R9
	CMPQ    R8, $0xff
	JEQ     lz4_s2_ll_loop

lz4_s2_ll_end:
	// DX still points at the last length byte. R8 = address of the last
	// literal byte; it must lie inside src.
	LEAQ  (DX)(R9*1), R8
	// Low nibble + 4 is the match length.
	ADDQ  $0x04, R10
	CMPQ  R8, BX
	JAE   lz4_s2_corrupt
	// DX = first literal byte, R8 = first byte after the literals.
	INCQ  DX
	INCQ  R8
	TESTQ R9, R9
	JZ    lz4_s2_lits_done
	// The literals must end before the dst limit.
	LEAQ  (AX)(R9*1), R11
	CMPQ  R11, CX
	JAE   lz4_s2_dstfull
	ADDQ  R9, SI
	// Emit the literal header for R11 = length-1. Values below 60 are stored
	// in the tag itself; otherwise tag 0xf0/0xf4/0xf8/0xfc is followed by
	// 1/2/3/4 bytes holding length-1.
	LEAL  -1(R9), R11
	CMPL  R11, $0x3c
	JB    one_byte_lz4_s2
	CMPL  R11, $0x00000100
	JB    two_bytes_lz4_s2
	CMPL  R11, $0x00010000
	JB    three_bytes_lz4_s2
	CMPL  R11, $0x01000000
	JB    four_bytes_lz4_s2
	MOVB  $0xfc, (AX)
	MOVL  R11, 1(AX)
	ADDQ  $0x05, AX
	JMP   memmove_long_lz4_s2

four_bytes_lz4_s2:
	// 24-bit length-1: low 16 bits as a word, bits 16-23 as a byte.
	MOVL R11, R12
	SHRL $0x10, R12
	MOVB $0xf8, (AX)
	MOVW R11, 1(AX)
	MOVB R12, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_lz4_s2

three_bytes_lz4_s2:
	MOVB $0xf4, (AX)
	MOVW R11, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_lz4_s2

two_bytes_lz4_s2:
	MOVB $0xf0, (AX)
	MOVB R11, 1(AX)
	ADDQ $0x02, AX
	// Up to 64 literal bytes use the short copy, longer runs the long one.
	CMPL R11, $0x40
	JB   memmove_lz4_s2
	JMP  memmove_long_lz4_s2

one_byte_lz4_s2:
	// Tag = (length-1) << 2.
	SHLB $0x02, R11
	MOVB R11, (AX)
	ADDQ $0x01, AX

memmove_lz4_s2:
	// R11 = dst position after the literals.
	LEAQ (AX)(R9*1), R11

	// genMemMoveShort
	// Copy 1..64 literal bytes. Every size class except the first uses two
	// or four loads that overlap in the middle, so no byte loop is needed.
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_lz4_s2_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_lz4_s2_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_lz4_s2_memmove_move_17through32
	JMP  emit_lit_memmove_lz4_s2_memmove_move_33through64

emit_lit_memmove_lz4_s2_memmove_move_8:
	// Always moves a full 8 bytes even when fewer literals remain. The store
	// stays inside dst because of the 10-byte margin kept in CX.
	// NOTE(review): the 8-byte load may extend past the literals in src;
	// presumably callers guarantee that is readable - confirm.
	MOVQ (DX), R12
	MOVQ R12, (AX)
	JMP  memmove_end_copy_lz4_s2

emit_lit_memmove_lz4_s2_memmove_move_8through16:
	// DX is reused as a data register here; it is reloaded from R8 at
	// lz4_s2_lits_emit_done.
	MOVQ (DX), R12
	MOVQ -8(DX)(R9*1), DX
	MOVQ R12, (AX)
	MOVQ DX, -8(AX)(R9*1)
	JMP  memmove_end_copy_lz4_s2

emit_lit_memmove_lz4_s2_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_lz4_s2

emit_lit_memmove_lz4_s2_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_lz4_s2:
	MOVQ R11, AX
	JMP  lz4_s2_lits_emit_done

memmove_long_lz4_s2:
	// Reached only with length-1 >= 64, i.e. at least 65 literal bytes.
	// R11 = dst position after the literals.
	LEAQ (AX)(R9*1), R11

	// genMemMoveLong
	// Keep the first and last 32 source bytes in X0-X3; they are stored last
	// with unaligned moves to cover the edges of the aligned middle copy.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	// R13 = number of whole 32-byte chunks.
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	// R14 = 64 - (AX & 31), so that AX + R14 - 32 is 32-byte aligned.
	MOVQ  AX, R12
	ANDL  $0x0000001f, R12
	MOVQ  $0x00000040, R14
	SUBQ  R12, R14
	// DECQ leaves CF alone, so JA sees CF from the SUBQ above (never set,
	// since R12 <= 31) and ZF from DECQ: taken when R13 is still non-zero.
	// With R9 >= 65 on this path R13 starts at 2 or more.
	DECQ  R13
	JA    emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
	LEAQ  -32(DX)(R14*1), R12
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_lz4_s2large_big_loop_back:
	// 32 bytes per pass through R12/R15: unaligned load, aligned store.
	MOVOU (R12), X4
	MOVOU 16(R12), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R12
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_lz4_s2large_big_loop_back

emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R14 (aligned store), then advance R14
	// and repeat while R14 <= length.
	MOVOU -32(DX)(R14*1), X4
	MOVOU -16(DX)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
	// Write the saved head and tail.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R11, AX

lz4_s2_lits_emit_done:
	// Continue reading src right after the literals.
	MOVQ R8, DX

lz4_s2_lits_done:
	// src may end right after the literals, but only if the token's match
	// nibble was 0 (R10 == 4); anything else is reported as corrupt.
	CMPQ DX, BX
	JNE  lz4_s2_match
	CMPQ R10, $0x04
	JEQ  lz4_s2_done
	JMP  lz4_s2_corrupt

lz4_s2_match:
	// Read the 16-bit offset into R9. The check requires at least one more
	// src byte after the offset.
	LEAQ    2(DX), R8
	CMPQ    R8, BX
	JAE     lz4_s2_corrupt
	MOVWQZX (DX), R9
	MOVQ    R8, DX
	// Offset 0 and offsets larger than the bytes produced so far are invalid.
	TESTQ   R9, R9
	JZ      lz4_s2_corrupt
	CMPQ    R9, SI
	JA      lz4_s2_corrupt
	// 0x13 = 15 + 4: the match nibble was 15, so extension bytes follow.
	CMPQ    R10, $0x13
	JNE     lz4_s2_ml_done

lz4_s2_ml_loop:
	// Add extension bytes to the match length; 0xff means another follows.
	MOVBQZX (DX), R8
	INCQ    DX
	ADDQ    R8, R10
	CMPQ    DX, BX
	JAE     lz4_s2_corrupt
	CMPQ    R8, $0xff
	JEQ     lz4_s2_ml_loop

lz4_s2_ml_done:
	ADDQ R10, SI
	// Same offset as the previous match: emit a repeat op, else a copy.
	CMPQ R9, DI
	JNE  lz4_s2_docopy

	// emitRepeat
	// R10 = length, R9 = offset. The shortest form that fits is chosen.
emit_repeat_again_lz4_s2:
	MOVL R10, R8
	LEAL -4(R10), R10
	// R8 = length, R10 = length - 4 from here on.
	CMPL R8, $0x08
	JBE  repeat_two_lz4_s2
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_lz4_s2
	CMPL R9, $0x00000800
	JB   repeat_two_offset_lz4_s2

cant_repeat_two_offset_lz4_s2:
	CMPL R10, $0x00000104
	JB   repeat_three_lz4_s2
	CMPL R10, $0x00010100
	JB   repeat_four_lz4_s2
	CMPL R10, $0x0100ffff
	JB   repeat_five_lz4_s2
	// Too long for one op: write bytes 1d 00 fb ff ff, reduce the length
	// by 16842747 and run the ladder again on the rest.
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_lz4_s2

repeat_five_lz4_s2:
	// 5 bytes: word 0x001d, then (length-4-65536) as 16 bits + 8 bits.
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP  lz4_s2_loop

repeat_four_lz4_s2:
	// 4 bytes: word 0x0019, then (length-4-256) as 16 bits.
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  lz4_s2_loop

repeat_three_lz4_s2:
	// 3 bytes: word 0x0015, then (length-4-4) as one byte.
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  lz4_s2_loop

repeat_two_lz4_s2:
	// 2 bytes: the word ((length-4) << 2) | 1.
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

repeat_two_offset_lz4_s2:
	// 2 bytes: tag = 1 + (length-4)*4 with offset bits 8 and up shifted
	// into bits 5+, then the low offset byte. R8 is zeroed only to serve as
	// the LEA base.
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

lz4_s2_docopy:
	// Remember this offset for the repeat check of the next match.
	MOVQ R9, DI

	// emitCopy
	// Lengths up to 64 fit in a single copy op.
	CMPL R10, $0x40
	JBE  two_byte_offset_short_lz4_s2
	CMPL R9, $0x00000800
	JAE  long_offset_short_lz4_s2
	// length > 64 and offset < 2048: emit a 2-byte copy of 8 bytes
	// (tag base 17 = 1 + 16), then cover the rest with a repeat op.
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB R9, 1(AX)
	MOVL R9, R11
	SHRL $0x08, R11
	SHLL $0x05, R11
	ORL  R11, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R10

	// emitRepeat
	// At least 57 bytes remain, so the two short forms are skipped.
	LEAL -4(R10), R10
	JMP  cant_repeat_two_offset_lz4_s2_emit_copy_short_2b

	// Same repeat ladder as above, for the remainder after the 8-byte copy.
emit_repeat_again_lz4_s2_emit_copy_short_2b:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE  repeat_two_lz4_s2_emit_copy_short_2b
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
	CMPL R9, $0x00000800
	JB   repeat_two_offset_lz4_s2_emit_copy_short_2b

cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
	CMPL R10, $0x00000104
	JB   repeat_three_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x00010100
	JB   repeat_four_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x0100ffff
	JB   repeat_five_lz4_s2_emit_copy_short_2b
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_lz4_s2_emit_copy_short_2b

repeat_five_lz4_s2_emit_copy_short_2b:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP  lz4_s2_loop

repeat_four_lz4_s2_emit_copy_short_2b:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  lz4_s2_loop

repeat_three_lz4_s2_emit_copy_short_2b:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  lz4_s2_loop

repeat_two_lz4_s2_emit_copy_short_2b:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short_2b:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

long_offset_short_lz4_s2:
	// length > 64 and offset >= 2048: emit a 3-byte copy of 60 bytes
	// (tag 0xee + 16-bit offset), then cover the rest with a repeat op.
	MOVB $0xee, (AX)
	MOVW R9, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
	// Same repeat ladder as above, for the remainder after the 60-byte copy.
emit_repeat_again_lz4_s2_emit_copy_short:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE  repeat_two_lz4_s2_emit_copy_short
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_lz4_s2_emit_copy_short
	CMPL R9, $0x00000800
	JB   repeat_two_offset_lz4_s2_emit_copy_short

cant_repeat_two_offset_lz4_s2_emit_copy_short:
	CMPL R10, $0x00000104
	JB   repeat_three_lz4_s2_emit_copy_short
	CMPL R10, $0x00010100
	JB   repeat_four_lz4_s2_emit_copy_short
	CMPL R10, $0x0100ffff
	JB   repeat_five_lz4_s2_emit_copy_short
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_lz4_s2_emit_copy_short

repeat_five_lz4_s2_emit_copy_short:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP  lz4_s2_loop

repeat_four_lz4_s2_emit_copy_short:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  lz4_s2_loop

repeat_three_lz4_s2_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  lz4_s2_loop

repeat_two_lz4_s2_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

two_byte_offset_short_lz4_s2:
	// length <= 64. R8 = length*4 is the base for both tag formulas; the
	// 2-byte form needs length < 12 and offset < 2048.
	MOVL R10, R8
	SHLL $0x02, R8
	CMPL R10, $0x0c
	JAE  emit_copy_three_lz4_s2
	CMPL R9, $0x00000800
	JAE  emit_copy_three_lz4_s2
	// 2-byte copy: tag = (length*4 - 15) | (offset>>8)<<5, low offset byte.
	LEAL -15(R8), R8
	MOVB R9, 1(AX)
	SHRL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

emit_copy_three_lz4_s2:
	// 3-byte copy: tag = length*4 - 2, then the 16-bit offset.
	LEAL -2(R8), R8
	MOVB R8, (AX)
	MOVW R9, 1(AX)
	ADDQ $0x03, AX
	JMP  lz4_s2_loop

lz4_s2_done:
	// Success: dstUsed = AX - dst_base, uncompressed = SI.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

lz4_s2_corrupt:
	// Malformed input: uncompressed = -1. dstUsed is not written.
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

lz4_s2_dstfull:
	// Output too small: uncompressed = -2. dstUsed is not written.
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

// func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// cvtLZ4sBlockAsm parses src as a series of sequences (token byte, optional
// extra literal-length bytes, literal bytes, 16-bit offset, optional extra
// match-length bytes) and re-emits them into dst as literal, copy and repeat
// ops. Judging by its name this handles the "LZ4s" input variant: here the
// low token nibble has 3 added to form the match length, and a nibble of 0
// means the sequence carries no match at all.
//
// Results:
//   uncompressed = sum of all literal and match lengths seen, or
//                  -1 if src is malformed / truncated,
//                  -2 if dst is too small
//   dstUsed      = bytes written to dst; only stored on success
//
// Register use:
//   AX  = write pointer into dst
//   CX  = dst_base + dst_len - 10 (writes stop once AX reaches this limit)
//   DX  = read pointer into src
//   BX  = src_base + src_len (end of src)
//   SI  = running uncompressed size
//   DI  = offset of the previous match, 0 initially (selects repeat ops)
//   R9  = literal length, later the match offset
//   R10 = match length (low token nibble + 3, plus extension bytes)
//   R8, R11-R15, X0-X5 = scratch
TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -10(AX)(CX*1), CX
	XORQ DI, DI

lz4s_s2_loop:
	// Start of one sequence: check both buffers, then split the token byte
	// into literal length (high nibble, R9) and match length (low nibble, R10).
	CMPQ    DX, BX
	JAE     lz4s_s2_corrupt
	CMPQ    AX, CX
	JAE     lz4s_s2_dstfull
	MOVBQZX (DX), R8
	MOVQ    R8, R9
	MOVQ    R8, R10
	SHRQ    $0x04, R9
	ANDQ    $0x0f, R10
	// A high nibble of 15 (token >= 0xf0) means extension bytes follow.
	CMPQ    R8, $0xf0
	JB      lz4s_s2_ll_end

lz4s_s2_ll_loop:
	// Add extension bytes to the literal length; a byte of 0xff means
	// another one follows.
	INCQ    DX
	CMPQ    DX, BX
	JAE     lz4s_s2_corrupt
	MOVBQZX (DX), R8
	ADDQ    R8, R9
	CMPQ    R8, $0xff
	JEQ     lz4s_s2_ll_loop

lz4s_s2_ll_end:
	// DX still points at the last length byte. R8 = address of the last
	// literal byte; it must lie inside src.
	LEAQ  (DX)(R9*1), R8
	// Low nibble + 3 is the match length.
	ADDQ  $0x03, R10
	CMPQ  R8, BX
	JAE   lz4s_s2_corrupt
	// DX = first literal byte, R8 = first byte after the literals.
	INCQ  DX
	INCQ  R8
	TESTQ R9, R9
	JZ    lz4s_s2_lits_done
	// The literals must end before the dst limit.
	LEAQ  (AX)(R9*1), R11
	CMPQ  R11, CX
	JAE   lz4s_s2_dstfull
	ADDQ  R9, SI
	// Emit the literal header for R11 = length-1. Values below 60 are stored
	// in the tag itself; otherwise tag 0xf0/0xf4/0xf8/0xfc is followed by
	// 1/2/3/4 bytes holding length-1.
	LEAL  -1(R9), R11
	CMPL  R11, $0x3c
	JB    one_byte_lz4s_s2
	CMPL  R11, $0x00000100
	JB    two_bytes_lz4s_s2
	CMPL  R11, $0x00010000
	JB    three_bytes_lz4s_s2
	CMPL  R11, $0x01000000
	JB    four_bytes_lz4s_s2
	MOVB  $0xfc, (AX)
	MOVL  R11, 1(AX)
	ADDQ  $0x05, AX
	JMP   memmove_long_lz4s_s2

four_bytes_lz4s_s2:
	// 24-bit length-1: low 16 bits as a word, bits 16-23 as a byte.
	MOVL R11, R12
	SHRL $0x10, R12
	MOVB $0xf8, (AX)
	MOVW R11, 1(AX)
	MOVB R12, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_lz4s_s2

three_bytes_lz4s_s2:
	MOVB $0xf4, (AX)
	MOVW R11, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_lz4s_s2

two_bytes_lz4s_s2:
	MOVB $0xf0, (AX)
	MOVB R11, 1(AX)
	ADDQ $0x02, AX
	// Up to 64 literal bytes use the short copy, longer runs the long one.
	CMPL R11, $0x40
	JB   memmove_lz4s_s2
	JMP  memmove_long_lz4s_s2

one_byte_lz4s_s2:
	// Tag = (length-1) << 2.
	SHLB $0x02, R11
	MOVB R11, (AX)
	ADDQ $0x01, AX

memmove_lz4s_s2:
	// R11 = dst position after the literals.
	LEAQ (AX)(R9*1), R11

	// genMemMoveShort
	// Copy 1..64 literal bytes. Every size class except the first uses two
	// or four loads that overlap in the middle, so no byte loop is needed.
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_lz4s_s2_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_lz4s_s2_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_lz4s_s2_memmove_move_17through32
	JMP  emit_lit_memmove_lz4s_s2_memmove_move_33through64

emit_lit_memmove_lz4s_s2_memmove_move_8:
	// Always moves a full 8 bytes even when fewer literals remain. The store
	// stays inside dst because of the 10-byte margin kept in CX.
	// NOTE(review): the 8-byte load may extend past the literals in src;
	// presumably callers guarantee that is readable - confirm.
	MOVQ (DX), R12
	MOVQ R12, (AX)
	JMP  memmove_end_copy_lz4s_s2

emit_lit_memmove_lz4s_s2_memmove_move_8through16:
	// DX is reused as a data register here; it is reloaded from R8 at
	// lz4s_s2_lits_emit_done.
	MOVQ (DX), R12
	MOVQ -8(DX)(R9*1), DX
	MOVQ R12, (AX)
	MOVQ DX, -8(AX)(R9*1)
	JMP  memmove_end_copy_lz4s_s2

emit_lit_memmove_lz4s_s2_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_lz4s_s2

emit_lit_memmove_lz4s_s2_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_lz4s_s2:
	MOVQ R11, AX
	JMP  lz4s_s2_lits_emit_done

memmove_long_lz4s_s2:
	// Reached only with length-1 >= 64, i.e. at least 65 literal bytes.
	// R11 = dst position after the literals.
	LEAQ (AX)(R9*1), R11

	// genMemMoveLong
	// Keep the first and last 32 source bytes in X0-X3; they are stored last
	// with unaligned moves to cover the edges of the aligned middle copy.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	// R13 = number of whole 32-byte chunks.
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	// R14 = 64 - (AX & 31), so that AX + R14 - 32 is 32-byte aligned.
	MOVQ  AX, R12
	ANDL  $0x0000001f, R12
	MOVQ  $0x00000040, R14
	SUBQ  R12, R14
	// DECQ leaves CF alone, so JA sees CF from the SUBQ above (never set,
	// since R12 <= 31) and ZF from DECQ: taken when R13 is still non-zero.
	// With R9 >= 65 on this path R13 starts at 2 or more.
	DECQ  R13
	JA    emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
	LEAQ  -32(DX)(R14*1), R12
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_lz4s_s2large_big_loop_back:
	// 32 bytes per pass through R12/R15: unaligned load, aligned store.
	MOVOU (R12), X4
	MOVOU 16(R12), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R12
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_lz4s_s2large_big_loop_back

emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R14 (aligned store), then advance R14
	// and repeat while R14 <= length.
	MOVOU -32(DX)(R14*1), X4
	MOVOU -16(DX)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
	// Write the saved head and tail.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R11, AX

lz4s_s2_lits_emit_done:
	// Continue reading src right after the literals.
	MOVQ R8, DX

lz4s_s2_lits_done:
	// src may end right after the literals, but only if the token's match
	// nibble was 0 (R10 == 3); anything else is reported as corrupt.
	CMPQ DX, BX
	JNE  lz4s_s2_match
	CMPQ R10, $0x03
	JEQ  lz4s_s2_done
	JMP  lz4s_s2_corrupt

lz4s_s2_match:
	// Match nibble 0 (R10 == 3): this sequence has no match part, so go
	// straight to the next token.
	CMPQ    R10, $0x03
	JEQ     lz4s_s2_loop
	// Read the 16-bit offset into R9. The check requires at least one more
	// src byte after the offset.
	LEAQ    2(DX), R8
	CMPQ    R8, BX
	JAE     lz4s_s2_corrupt
	MOVWQZX (DX), R9
	MOVQ    R8, DX
	// Offset 0 and offsets larger than the bytes produced so far are invalid.
	TESTQ   R9, R9
	JZ      lz4s_s2_corrupt
	CMPQ    R9, SI
	JA      lz4s_s2_corrupt
	// 0x12 = 15 + 3: the match nibble was 15, so extension bytes follow.
	CMPQ    R10, $0x12
	JNE     lz4s_s2_ml_done

lz4s_s2_ml_loop:
	// Add extension bytes to the match length; 0xff means another follows.
	MOVBQZX (DX), R8
	INCQ    DX
	ADDQ    R8, R10
	CMPQ    DX, BX
	JAE     lz4s_s2_corrupt
	CMPQ    R8, $0xff
	JEQ     lz4s_s2_ml_loop

lz4s_s2_ml_done:
	ADDQ R10, SI
	// Same offset as the previous match: emit a repeat op, else a copy.
	CMPQ R9, DI
	JNE  lz4s_s2_docopy

	// emitRepeat
	// R10 = length, R9 = offset. The shortest form that fits is chosen.
emit_repeat_again_lz4_s2:
	MOVL R10, R8
	LEAL -4(R10), R10
	// R8 = length, R10 = length - 4 from here on.
	CMPL R8, $0x08
	JBE  repeat_two_lz4_s2
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_lz4_s2
	CMPL R9, $0x00000800
	JB   repeat_two_offset_lz4_s2

cant_repeat_two_offset_lz4_s2:
	CMPL R10, $0x00000104
	JB   repeat_three_lz4_s2
	CMPL R10, $0x00010100
	JB   repeat_four_lz4_s2
	CMPL R10, $0x0100ffff
	JB   repeat_five_lz4_s2
	// Too long for one op: write bytes 1d 00 fb ff ff, reduce the length
	// by 16842747 and run the ladder again on the rest.
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_lz4_s2

repeat_five_lz4_s2:
	// 5 bytes: word 0x001d, then (length-4-65536) as 16 bits + 8 bits.
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP  lz4s_s2_loop

repeat_four_lz4_s2:
	// 4 bytes: word 0x0019, then (length-4-256) as 16 bits.
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  lz4s_s2_loop

repeat_three_lz4_s2:
	// 3 bytes: word 0x0015, then (length-4-4) as one byte.
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  lz4s_s2_loop

repeat_two_lz4_s2:
	// 2 bytes: the word ((length-4) << 2) | 1.
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

repeat_two_offset_lz4_s2:
	// 2 bytes: tag = 1 + (length-4)*4 with offset bits 8 and up shifted
	// into bits 5+, then the low offset byte. R8 is zeroed only to serve as
	// the LEA base.
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

lz4s_s2_docopy:
	// Remember this offset for the repeat check of the next match.
	MOVQ R9, DI

	// emitCopy
	// Lengths up to 64 fit in a single copy op.
	CMPL R10, $0x40
	JBE  two_byte_offset_short_lz4_s2
	CMPL R9, $0x00000800
	JAE  long_offset_short_lz4_s2
	// length > 64 and offset < 2048: emit a 2-byte copy of 8 bytes
	// (tag base 17 = 1 + 16), then cover the rest with a repeat op.
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB R9, 1(AX)
	MOVL R9, R11
	SHRL $0x08, R11
	SHLL $0x05, R11
	ORL  R11, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R10

	// emitRepeat
	// At least 57 bytes remain, so the two short forms are skipped.
	LEAL -4(R10), R10
	JMP  cant_repeat_two_offset_lz4_s2_emit_copy_short_2b

	// Same repeat ladder as above, for the remainder after the 8-byte copy.
emit_repeat_again_lz4_s2_emit_copy_short_2b:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE  repeat_two_lz4_s2_emit_copy_short_2b
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
	CMPL R9, $0x00000800
	JB   repeat_two_offset_lz4_s2_emit_copy_short_2b

cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
	CMPL R10, $0x00000104
	JB   repeat_three_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x00010100
	JB   repeat_four_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x0100ffff
	JB   repeat_five_lz4_s2_emit_copy_short_2b
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_lz4_s2_emit_copy_short_2b

repeat_five_lz4_s2_emit_copy_short_2b:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP  lz4s_s2_loop

repeat_four_lz4_s2_emit_copy_short_2b:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  lz4s_s2_loop

repeat_three_lz4_s2_emit_copy_short_2b:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  lz4s_s2_loop

repeat_two_lz4_s2_emit_copy_short_2b:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short_2b:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

long_offset_short_lz4_s2:
	// length > 64 and offset >= 2048: emit a 3-byte copy of 60 bytes
	// (tag 0xee + 16-bit offset), then cover the rest with a repeat op.
	MOVB $0xee, (AX)
	MOVW R9, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
	// Same repeat ladder as above, for the remainder after the 60-byte copy.
emit_repeat_again_lz4_s2_emit_copy_short:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE  repeat_two_lz4_s2_emit_copy_short
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_lz4_s2_emit_copy_short
	CMPL R9, $0x00000800
	JB   repeat_two_offset_lz4_s2_emit_copy_short

cant_repeat_two_offset_lz4_s2_emit_copy_short:
	CMPL R10, $0x00000104
	JB   repeat_three_lz4_s2_emit_copy_short
	CMPL R10, $0x00010100
	JB   repeat_four_lz4_s2_emit_copy_short
	CMPL R10, $0x0100ffff
	JB   repeat_five_lz4_s2_emit_copy_short
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_lz4_s2_emit_copy_short

repeat_five_lz4_s2_emit_copy_short:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP  lz4s_s2_loop

repeat_four_lz4_s2_emit_copy_short:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  lz4s_s2_loop

repeat_three_lz4_s2_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  lz4s_s2_loop

repeat_two_lz4_s2_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

two_byte_offset_short_lz4_s2:
	// length <= 64. R8 = length*4 is the base for both tag formulas; the
	// 2-byte form needs length < 12 and offset < 2048.
	MOVL R10, R8
	SHLL $0x02, R8
	CMPL R10, $0x0c
	JAE  emit_copy_three_lz4_s2
	CMPL R9, $0x00000800
	JAE  emit_copy_three_lz4_s2
	// 2-byte copy: tag = (length*4 - 15) | (offset>>8)<<5, low offset byte.
	LEAL -15(R8), R8
	MOVB R9, 1(AX)
	SHRL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

emit_copy_three_lz4_s2:
	// 3-byte copy: tag = length*4 - 2, then the 16-bit offset.
	LEAL -2(R8), R8
	MOVB R8, (AX)
	MOVW R9, 1(AX)
	ADDQ $0x03, AX
	JMP  lz4s_s2_loop

lz4s_s2_done:
	// Success: dstUsed = AX - dst_base, uncompressed = SI.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

lz4s_s2_corrupt:
	// Malformed input: uncompressed = -1. dstUsed is not written.
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

lz4s_s2_dstfull:
	// Output too small: uncompressed = -2. dstUsed is not written.
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

// func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// cvtLZ4BlockSnappyAsm parses src as a series of sequences (token byte,
// optional extra literal-length bytes, literal bytes, 16-bit offset, optional
// extra match-length bytes) and re-emits them into dst as literal and copy
// ops. No repeat ops are produced and the previous offset is not tracked;
// judging by the name the output is meant to be Snappy-compatible.
//
// Results:
//   uncompressed = sum of all literal and match lengths seen, or
//                  -1 if src is malformed / truncated,
//                  -2 if dst is too small
//   dstUsed      = bytes written to dst; only stored on success
//
// Register use:
//   AX  = write pointer into dst
//   CX  = dst_base + dst_len - 10 (writes stop once AX reaches this limit)
//   DX  = read pointer into src
//   BX  = src_base + src_len (end of src)
//   SI  = running uncompressed size
//   R8  = literal length, later the match offset
//   R9  = match length (low token nibble + 4, plus extension bytes)
//   DI, R10-R14, X0-X5 = scratch
TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -10(AX)(CX*1), CX

lz4_snappy_loop:
	// Start of one sequence: check both buffers, then split the token byte
	// into literal length (high nibble, R8) and match length (low nibble, R9).
	CMPQ    DX, BX
	JAE     lz4_snappy_corrupt
	CMPQ    AX, CX
	JAE     lz4_snappy_dstfull
	MOVBQZX (DX), DI
	MOVQ    DI, R8
	MOVQ    DI, R9
	SHRQ    $0x04, R8
	ANDQ    $0x0f, R9
	// A high nibble of 15 (token >= 0xf0) means extension bytes follow.
	CMPQ    DI, $0xf0
	JB      lz4_snappy_ll_end

lz4_snappy_ll_loop:
	// Add extension bytes to the literal length; a byte of 0xff means
	// another one follows.
	INCQ    DX
	CMPQ    DX, BX
	JAE     lz4_snappy_corrupt
	MOVBQZX (DX), DI
	ADDQ    DI, R8
	CMPQ    DI, $0xff
	JEQ     lz4_snappy_ll_loop

lz4_snappy_ll_end:
	// DX still points at the last length byte. DI = address of the last
	// literal byte; it must lie inside src.
	LEAQ  (DX)(R8*1), DI
	// Low nibble + 4 is the match length.
	ADDQ  $0x04, R9
	CMPQ  DI, BX
	JAE   lz4_snappy_corrupt
	// DX = first literal byte, DI = first byte after the literals.
	INCQ  DX
	INCQ  DI
	TESTQ R8, R8
	JZ    lz4_snappy_lits_done
	// The literals must end before the dst limit.
	LEAQ  (AX)(R8*1), R10
	CMPQ  R10, CX
	JAE   lz4_snappy_dstfull
	ADDQ  R8, SI
	// Emit the literal header for R10 = length-1. Values below 60 are stored
	// in the tag itself; otherwise tag 0xf0/0xf4/0xf8/0xfc is followed by
	// 1/2/3/4 bytes holding length-1.
	LEAL  -1(R8), R10
	CMPL  R10, $0x3c
	JB    one_byte_lz4_snappy
	CMPL  R10, $0x00000100
	JB    two_bytes_lz4_snappy
	CMPL  R10, $0x00010000
	JB    three_bytes_lz4_snappy
	CMPL  R10, $0x01000000
	JB    four_bytes_lz4_snappy
	MOVB  $0xfc, (AX)
	MOVL  R10, 1(AX)
	ADDQ  $0x05, AX
	JMP   memmove_long_lz4_snappy

four_bytes_lz4_snappy:
	// 24-bit length-1: low 16 bits as a word, bits 16-23 as a byte.
	MOVL R10, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW R10, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_lz4_snappy

three_bytes_lz4_snappy:
	MOVB $0xf4, (AX)
	MOVW R10, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_lz4_snappy

two_bytes_lz4_snappy:
	MOVB $0xf0, (AX)
	MOVB R10, 1(AX)
	ADDQ $0x02, AX
	// Up to 64 literal bytes use the short copy, longer runs the long one.
	CMPL R10, $0x40
	JB   memmove_lz4_snappy
	JMP  memmove_long_lz4_snappy

one_byte_lz4_snappy:
	// Tag = (length-1) << 2.
	SHLB $0x02, R10
	MOVB R10, (AX)
	ADDQ $0x01, AX

memmove_lz4_snappy:
	// R10 = dst position after the literals.
	LEAQ (AX)(R8*1), R10

	// genMemMoveShort
	// Copy 1..64 literal bytes. Every size class except the first uses two
	// or four loads that overlap in the middle, so no byte loop is needed.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_lz4_snappy_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_lz4_snappy_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_lz4_snappy_memmove_move_17through32
	JMP  emit_lit_memmove_lz4_snappy_memmove_move_33through64

emit_lit_memmove_lz4_snappy_memmove_move_8:
	// Always moves a full 8 bytes even when fewer literals remain. The store
	// stays inside dst because of the 10-byte margin kept in CX.
	// NOTE(review): the 8-byte load may extend past the literals in src;
	// presumably callers guarantee that is readable - confirm.
	MOVQ (DX), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_lz4_snappy

emit_lit_memmove_lz4_snappy_memmove_move_8through16:
	// DX is reused as a data register here; it is reloaded from DI at
	// lz4_snappy_lits_emit_done.
	MOVQ (DX), R11
	MOVQ -8(DX)(R8*1), DX
	MOVQ R11, (AX)
	MOVQ DX, -8(AX)(R8*1)
	JMP  memmove_end_copy_lz4_snappy

emit_lit_memmove_lz4_snappy_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_lz4_snappy

emit_lit_memmove_lz4_snappy_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_lz4_snappy:
	MOVQ R10, AX
	JMP  lz4_snappy_lits_emit_done

memmove_long_lz4_snappy:
	// Reached only with length-1 >= 64, i.e. at least 65 literal bytes.
	// R10 = dst position after the literals.
	LEAQ (AX)(R8*1), R10

	// genMemMoveLong
	// Keep the first and last 32 source bytes in X0-X3; they are stored last
	// with unaligned moves to cover the edges of the aligned middle copy.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	// R12 = number of whole 32-byte chunks.
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	// R13 = 64 - (AX & 31), so that AX + R13 - 32 is 32-byte aligned.
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	// DECQ leaves CF alone, so JA sees CF from the SUBQ above (never set,
	// since R11 <= 31) and ZF from DECQ: taken when R12 is still non-zero.
	// With R8 >= 65 on this path R12 starts at 2 or more.
	DECQ  R12
	JA    emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
	LEAQ  -32(DX)(R13*1), R11
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_lz4_snappylarge_big_loop_back:
	// 32 bytes per pass through R11/R14: unaligned load, aligned store.
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_lz4_snappylarge_big_loop_back

emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R13 (aligned store), then advance R13
	// and repeat while R13 <= length.
	MOVOU -32(DX)(R13*1), X4
	MOVOU -16(DX)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
	// Write the saved head and tail.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  R10, AX

lz4_snappy_lits_emit_done:
	// Continue reading src right after the literals.
	MOVQ DI, DX

lz4_snappy_lits_done:
	// src may end right after the literals, but only if the token's match
	// nibble was 0 (R9 == 4); anything else is reported as corrupt.
	CMPQ DX, BX
	JNE  lz4_snappy_match
	CMPQ R9, $0x04
	JEQ  lz4_snappy_done
	JMP  lz4_snappy_corrupt

lz4_snappy_match:
	// Read the 16-bit offset into R8. The check requires at least one more
	// src byte after the offset.
	LEAQ    2(DX), DI
	CMPQ    DI, BX
	JAE     lz4_snappy_corrupt
	MOVWQZX (DX), R8
	MOVQ    DI, DX
	// Offset 0 and offsets larger than the bytes produced so far are invalid.
	TESTQ   R8, R8
	JZ      lz4_snappy_corrupt
	CMPQ    R8, SI
	JA      lz4_snappy_corrupt
	// 0x13 = 15 + 4: the match nibble was 15, so extension bytes follow.
	CMPQ    R9, $0x13
	JNE     lz4_snappy_ml_done

lz4_snappy_ml_loop:
	// Add extension bytes to the match length; 0xff means another follows.
	MOVBQZX (DX), DI
	INCQ    DX
	ADDQ    DI, R9
	CMPQ    DX, BX
	JAE     lz4_snappy_corrupt
	CMPQ    DI, $0xff
	JEQ     lz4_snappy_ml_loop

lz4_snappy_ml_done:
	ADDQ R9, SI

	// emitCopy
	// While more than 64 bytes remain, emit 3-byte copies of 60 bytes each
	// (tag 0xee + 16-bit offset).
two_byte_offset_lz4_s2:
	CMPL R9, $0x40
	JBE  two_byte_offset_short_lz4_s2
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	// Once dst has reached its limit, jump to the loop head, whose own
	// checks end the function with an error return.
	CMPQ AX, CX
	JAE  lz4_snappy_loop
	JMP  two_byte_offset_lz4_s2

two_byte_offset_short_lz4_s2:
	// length <= 64. DI = length*4 is the base for both tag formulas; the
	// 2-byte form needs length < 12 and offset < 2048.
	MOVL R9, DI
	SHLL $0x02, DI
	CMPL R9, $0x0c
	JAE  emit_copy_three_lz4_s2
	CMPL R8, $0x00000800
	JAE  emit_copy_three_lz4_s2
	// 2-byte copy: tag = (length*4 - 15) | (offset>>8)<<5, low offset byte.
	LEAL -15(DI), DI
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  lz4_snappy_loop

emit_copy_three_lz4_s2:
	// 3-byte copy: tag = length*4 - 2, then the 16-bit offset.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  lz4_snappy_loop

lz4_snappy_done:
	// Success: dstUsed = AX - dst_base, uncompressed = SI.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

lz4_snappy_corrupt:
	// Malformed input: uncompressed = -1. dstUsed is not written.
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

lz4_snappy_dstfull:
	// Output too small: uncompressed = -2. dstUsed is not written.
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

// func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// Walks the sequences in src (LZ4s block layout, per the function name) and
// re-emits each literal run and each match as literal/copy tags into dst.
//
// Results:
//   success:  uncompressed = total literal + match bytes described by src,
//             dstUsed      = number of bytes written to dst.
//   corrupt:  uncompressed = -1 (dstUsed is not written on this path).
//   dst full: uncompressed = -2 (dstUsed is not written on this path).
//
// Register roles, as used below:
//   AX  dst write cursor
//   CX  dst limit (dst_base + dst_len - 10)
//   DX  src read cursor
//   BX  src end (src_base + src_len)
//   SI  running uncompressed byte count
//   R8  literal length of the current sequence, later the match offset
//   R9  match length of the current sequence (low nibble + 3, plus extensions)
//   DI, R10-R14, X0-X5  scratch
TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	// BX = one past the last src byte.
	LEAQ (DX)(BX*1), BX
	// CX = output limit, kept 10 bytes short of the end of dst.
	// Presumably slack for tag bytes and the fixed-size stores below; confirm against the generator.
	LEAQ -10(AX)(CX*1), CX

	// Start of one sequence: bounds-check both cursors, then split the first
	// byte into its high nibble (literal length, R8) and low nibble (match
	// length, R9).
lz4s_snappy_loop:
	CMPQ    DX, BX
	JAE     lz4s_snappy_corrupt
	CMPQ    AX, CX
	JAE     lz4s_snappy_dstfull
	MOVBQZX (DX), DI
	MOVQ    DI, R8
	MOVQ    DI, R9
	SHRQ    $0x04, R8
	ANDQ    $0x0f, R9
	// High nibble below 15: the literal length is complete.
	CMPQ    DI, $0xf0
	JB      lz4s_snappy_ll_end

	// Extended literal length: keep adding the following bytes to R8 for as
	// long as each byte read equals 0xff.
lz4s_snappy_ll_loop:
	INCQ    DX
	CMPQ    DX, BX
	JAE     lz4s_snappy_corrupt
	MOVBQZX (DX), DI
	ADDQ    DI, R8
	CMPQ    DI, $0xff
	JEQ     lz4s_snappy_ll_loop

lz4s_snappy_ll_end:
	// DX still points at the last length byte here, so DI = DX + R8 is the
	// address of the last literal byte; it must lie inside src.
	LEAQ  (DX)(R8*1), DI
	// Bias the match length by 3; R9 == 3 therefore means the low nibble was 0.
	ADDQ  $0x03, R9
	CMPQ  DI, BX
	JAE   lz4s_snappy_corrupt
	// DX = first literal byte, DI = one past the literal run.
	INCQ  DX
	INCQ  DI
	TESTQ R8, R8
	JZ    lz4s_snappy_lits_done
	// The literal run must end before the output limit.
	LEAQ  (AX)(R8*1), R10
	CMPQ  R10, CX
	JAE   lz4s_snappy_dstfull
	ADDQ  R8, SI
	// R10 = length - 1 selects the literal tag form:
	//   < 60      one tag byte, (length-1)<<2
	//   < 1<<8    0xf0 + 1 length byte
	//   < 1<<16   0xf4 + 2 length bytes
	//   < 1<<24   0xf8 + 3 length bytes
	//   otherwise 0xfc + 4 length bytes
	LEAL  -1(R8), R10
	CMPL  R10, $0x3c
	JB    one_byte_lz4s_snappy
	CMPL  R10, $0x00000100
	JB    two_bytes_lz4s_snappy
	CMPL  R10, $0x00010000
	JB    three_bytes_lz4s_snappy
	CMPL  R10, $0x01000000
	JB    four_bytes_lz4s_snappy
	MOVB  $0xfc, (AX)
	MOVL  R10, 1(AX)
	ADDQ  $0x05, AX
	JMP   memmove_long_lz4s_snappy

four_bytes_lz4s_snappy:
	// R11 carries bits 16..23 of length-1 for the third length byte.
	MOVL R10, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW R10, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_lz4s_snappy

three_bytes_lz4s_snappy:
	MOVB $0xf4, (AX)
	MOVW R10, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_lz4s_snappy

two_bytes_lz4s_snappy:
	MOVB $0xf0, (AX)
	MOVB R10, 1(AX)
	ADDQ $0x02, AX
	// length-1 < 64 (length <= 64) still fits the short copy below.
	CMPL R10, $0x40
	JB   memmove_lz4s_snappy
	JMP  memmove_long_lz4s_snappy

one_byte_lz4s_snappy:
	SHLB $0x02, R10
	MOVB R10, (AX)
	ADDQ $0x01, AX

	// Short literal copy (R8 <= 64). R10 = dst cursor after the copy.
memmove_lz4s_snappy:
	LEAQ (AX)(R8*1), R10

	// genMemMoveShort
	// Dispatch on length: <=8, 9..16, 17..32, 33..64.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_lz4s_snappy_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_lz4s_snappy_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_lz4s_snappy_memmove_move_17through32
	JMP  emit_lit_memmove_lz4s_snappy_memmove_move_33through64

emit_lit_memmove_lz4s_snappy_memmove_move_8:
	// Always moves 8 bytes, whatever R8 (1..8) is.
	// NOTE(review): for R8 < 8 this loads past the literal run and, when the
	// run ends at the end of src, past src_base+src_len. Confirm callers
	// tolerate this.
	MOVQ (DX), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_lz4s_snappy

emit_lit_memmove_lz4s_snappy_memmove_move_8through16:
	// Overlapping first-8 / last-8 moves. DX is reused as a data register
	// here; it is reloaded from DI at lz4s_snappy_lits_emit_done.
	MOVQ (DX), R11
	MOVQ -8(DX)(R8*1), DX
	MOVQ R11, (AX)
	MOVQ DX, -8(AX)(R8*1)
	JMP  memmove_end_copy_lz4s_snappy

emit_lit_memmove_lz4s_snappy_memmove_move_17through32:
	// Overlapping first-16 / last-16 moves.
	MOVOU (DX), X0
	MOVOU -16(DX)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_lz4s_snappy

emit_lit_memmove_lz4s_snappy_memmove_move_33through64:
	// Overlapping first-32 / last-32 moves.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_lz4s_snappy:
	MOVQ R10, AX
	JMP  lz4s_snappy_lits_emit_done

	// Long literal copy. R10 = dst cursor after the copy.
memmove_long_lz4s_snappy:
	LEAQ (AX)(R8*1), R10

	// genMemMoveLong
	// The first and last 32 source bytes are loaded up front into X0-X3 and
	// stored last, so the loop in between only has to cover the middle.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	// R12 = number of whole 32-byte blocks in the run.
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	// R13 = 64 - (AX & 31). The loop stores go to -32(AX)(R13*1), i.e.
	// AX + 32 - (AX & 31), which is 32-byte aligned; that is what allows the
	// aligned MOVOA stores below.
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	// DECQ does not modify CF, so JA tests ZF from the DECQ together with the
	// CF left by the SUBQ above.
	// NOTE(review): every path into this label has R8 >= 65, so R12 >= 2 and
	// this branch looks always taken, leaving big_loop_back unused in this
	// function. Confirm against the generator.
	DECQ  R12
	JA    emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
	LEAQ  -32(DX)(R13*1), R11
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_lz4s_snappylarge_big_loop_back:
	// 32 bytes per iteration through running src (R11) / dst (R14) pointers,
	// with R13 advanced in step.
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_lz4s_snappylarge_big_loop_back

emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32:
	// 32 bytes per iteration, indexed by R13; repeats while R8 >= R13.
	MOVOU -32(DX)(R13*1), X4
	MOVOU -16(DX)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
	// Store the saved head and tail.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  R10, AX

	// Literals written: move the src cursor past the literal run.
lz4s_snappy_lits_emit_done:
	MOVQ DI, DX

	// If src is exhausted here, the block is valid only when this sequence
	// carries no match (R9 == 3); otherwise it is reported as corrupt.
lz4s_snappy_lits_done:
	CMPQ DX, BX
	JNE  lz4s_snappy_match
	CMPQ R9, $0x03
	JEQ  lz4s_snappy_done
	JMP  lz4s_snappy_corrupt

lz4s_snappy_match:
	// R9 == 3: the low nibble was 0, so no offset is read and the next
	// sequence starts immediately.
	CMPQ    R9, $0x03
	JEQ     lz4s_snappy_loop
	// The 2 offset bytes must be followed by at least one more src byte.
	LEAQ    2(DX), DI
	CMPQ    DI, BX
	JAE     lz4s_snappy_corrupt
	// R8 = 16-bit offset, zero-extended.
	MOVWQZX (DX), R8
	MOVQ    DI, DX
	// Offset 0, or an offset larger than the bytes counted so far (SI), is corrupt.
	TESTQ   R8, R8
	JZ      lz4s_snappy_corrupt
	CMPQ    R8, SI
	JA      lz4s_snappy_corrupt
	// 0x12 = 15 + 3: only a low nibble of 15 has length extension bytes.
	CMPQ    R9, $0x12
	JNE     lz4s_snappy_ml_done

	// Extended match length: add bytes to R9 while each byte read equals 0xff.
	// Running into the end of src while doing so is corrupt.
lz4s_snappy_ml_loop:
	MOVBQZX (DX), DI
	INCQ    DX
	ADDQ    DI, R9
	CMPQ    DX, BX
	JAE     lz4s_snappy_corrupt
	CMPQ    DI, $0xff
	JEQ     lz4s_snappy_ml_loop

lz4s_snappy_ml_done:
	// Count the full match length as uncompressed output.
	ADDQ R9, SI

	// emitCopy
	// R9 = remaining match length, R8 = offset. While more than 64 bytes
	// remain, emit a 3-byte copy of length 60 (tag 0xee = (60-1)<<2 | 2,
	// then the 16-bit offset).
two_byte_offset_lz4_s2:
	CMPL R9, $0x40
	JBE  two_byte_offset_short_lz4_s2
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	// If the output cursor has reached the limit, branch back to the loop
	// head (which re-checks the src and dst bounds) instead of emitting the
	// rest of this copy.
	CMPQ AX, CX
	JAE  lz4s_snappy_loop
	JMP  two_byte_offset_lz4_s2

two_byte_offset_short_lz4_s2:
	// DI = length << 2, the basis for both tag forms below.
	MOVL R9, DI
	SHLL $0x02, DI
	// Length >= 12 or offset >= 2048 needs the 3-byte form.
	CMPL R9, $0x0c
	JAE  emit_copy_three_lz4_s2
	CMPL R8, $0x00000800
	JAE  emit_copy_three_lz4_s2
	// 2-byte form: tag = (length-4)<<2 | 1 | (offset>>8)<<5, followed by the
	// low 8 bits of the offset. (length<<2) - 15 equals (length-4)<<2 | 1.
	LEAL -15(DI), DI
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_snappy_loop

emit_copy_three_lz4_s2:
	// 3-byte form: tag = (length-1)<<2 | 2, followed by the 16-bit offset.
	// (length<<2) - 2 equals (length-1)<<2 | 2.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  lz4s_snappy_loop

	// Success: dstUsed = AX - dst_base, uncompressed = SI.
lz4s_snappy_done:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

	// Corrupt input: uncompressed = -1. dstUsed is not written.
lz4s_snappy_corrupt:
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

	// Output limit reached: uncompressed = -2. dstUsed is not written.
lz4s_snappy_dstfull:
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET