Commit 886729454f1 for php.net

commit 886729454f1a551f47a7316d33f36093e71cb43d
Author: Dmitry Stogov <dmitry@php.net>
Date:   Mon Dec 15 20:13:03 2025 +0300

    Update IR (#20710)

    IR commit: 3d72a7295c77743da22b36bab808ebb5f564488d
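In short, this IR sync adds GCC-style address-of-label support (new IR_LABEL constants, an optional label operand on BEGIN, the IGOTO op, and an add_label loader hook), volatile memory ops (LOAD_v/STORE_v/VLOAD_v/VSTORE_v), protection-key based W^X toggling for JIT memory on Linux/x86-64, load forwarding across calls to const/pure functions, an explicit alignment argument for VA_ARG_EX, constant-folding fixes (ABS for the small integer types, MUL_OV overflow detection, shift counts reduced modulo the width), and AArch64 codegen fixes for frame offsets that exceed the instruction's immediate encodings.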

diff --git a/ext/opcache/jit/ir/ir.c b/ext/opcache/jit/ir/ir.c
index 81621ce11bd..745a66b2163 100644
--- a/ext/opcache/jit/ir/ir.c
+++ b/ext/opcache/jit/ir/ir.c
@@ -118,7 +118,7 @@ void ir_print_const(const ir_ctx *ctx, const ir_insn *insn, FILE *f, bool quoted
 {
 	char buf[128];

-	if (insn->op == IR_FUNC || insn->op == IR_SYM) {
+	if (insn->op == IR_FUNC || insn->op == IR_SYM || insn->op == IR_LABEL) {
 		fprintf(f, "%s", ir_get_str(ctx, insn->val.name));
 		return;
 	} else if (insn->op == IR_STR) {
@@ -290,6 +290,7 @@ void ir_print_const(const ir_ctx *ctx, const ir_insn *insn, FILE *f, bool quoted
 #define ir_op_kind_prb     IR_OPND_PROB
 #define ir_op_kind_opt     IR_OPND_PROB
 #define ir_op_kind_pro     IR_OPND_PROTO
+#define ir_op_kind_lbl     IR_OPND_LABEL_REF

 #define _IR_OP_FLAGS(name, flags, op1, op2, op3) \
 	IR_OP_FLAGS(ir_op_flag_ ## flags, ir_op_kind_ ## op1, ir_op_kind_ ## op2, ir_op_kind_ ## op3),
@@ -689,6 +690,13 @@ ir_ref ir_const_str(ir_ctx *ctx, ir_ref str)
 	return ir_const_ex(ctx, val, IR_ADDR, IR_OPTX(IR_STR, IR_ADDR, 0));
 }

+ir_ref ir_const_label(ir_ctx *ctx, ir_ref str)
+{
+	ir_val val;
+	val.u64 = str;
+	return ir_const_ex(ctx, val, IR_ADDR, IR_OPTX(IR_LABEL, IR_ADDR, 0));
+}
+
 ir_ref ir_str(ir_ctx *ctx, const char *s)
 {
 	size_t len;
@@ -879,6 +887,17 @@ static ir_ref _ir_fold_cse(ir_ctx *ctx, uint32_t opt, ir_ref op1, ir_ref op2, ir
 	return IR_UNUSED;
 }

+IR_ALWAYS_INLINE ir_ref _ir_fold_cast(ir_ctx *ctx, ir_ref ref, ir_type type)
+{
+	if (ctx->ir_base[ref].type == type) {
+		return ref;
+	} else if (IR_IS_CONST_REF(ref) && !IR_IS_SYM_CONST(ctx->ir_base[ref].op)) {
+		return ir_const(ctx, ctx->ir_base[ref].val, type);
+	} else {
+		return ir_emit1(ctx, IR_OPT(IR_BITCAST, type), ref);
+	}
+}
+
 #define IR_FOLD(X)        IR_FOLD1(X, __LINE__)
 #define IR_FOLD1(X, Y)    IR_FOLD2(X, Y)
 #define IR_FOLD2(X, Y)    case IR_RULE_ ## Y:
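The new _ir_fold_cast helper picks the cheapest way to retype a value: reuse the node when the type already matches, re-create non-symbolic constants directly in the target type, and fall back to an explicit BITCAST node otherwise. BITCAST reinterprets the bits rather than converting the value; a minimal C analogue of that last case:

    #include <stdint.h>
    #include <string.h>

    /* Reinterpret a double's bit pattern as u64, the way IR_BITCAST does;
       memcpy keeps this well-defined where a pointer pun would not be. */
    static uint64_t bitcast_f64_to_u64(double d)
    {
        uint64_t u;
        memcpy(&u, &d, sizeof u);
        return u;
    }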
@@ -1158,7 +1177,7 @@ ir_ref ir_bind(ir_ctx *ctx, ir_ref var, ir_ref def)
 	IR_ASSERT(var < 0);
 	if (!ir_hashtab_add(ctx->binding, def, var)) {
 		/* Add a copy with different binding */
-		def = ir_emit2(ctx, IR_OPT(IR_COPY, ctx->ir_base[def].type), def, 1);
+		def = ir_emit2(ctx, IR_OPT(IR_COPY, ctx->ir_base[def].type), def, IR_COPY_HARD);
 		ir_hashtab_add(ctx->binding, def, var);
 	}
 	return def;
@@ -1836,8 +1855,49 @@ int ir_mem_flush(void *ptr, size_t size)
 	return 1;
 }
 #else
+
+#if defined(__linux__) && defined(__x86_64__) && defined(PKEY_DISABLE_WRITE)
+# define HAVE_PKEY_MPROTECT 1
+#endif
+
+#ifdef HAVE_PKEY_MPROTECT
+
+#ifndef PKEY_DISABLE_EXECUTE
+# define PKEY_DISABLE_EXECUTE 0
+#endif
+
+int pkey_mprotect(void* addr, size_t len, int prot, int pkey) __attribute__((weak));
+int pkey_alloc(unsigned int, unsigned int) __attribute__((weak));
+int pkey_free(int) __attribute__((weak));
+int pkey_set(int, unsigned) __attribute__((weak));
+
+static int ir_pkey = 0;
+#endif
+
 void *ir_mem_mmap(size_t size)
 {
+#ifdef HAVE_PKEY_MPROTECT
+	if (!ir_pkey && pkey_mprotect) {
+		int key = pkey_alloc(0, PKEY_DISABLE_WRITE);
+		if (key > 0) {
+			ir_pkey = key;
+		}
+	}
+	if (ir_pkey > 0) {
+		void *ret = mmap(NULL, size, PROT_EXEC|PROT_READ|PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		if (ret == MAP_FAILED) {
+			return NULL;
+		}
+		if (pkey_mprotect(ret, size, PROT_EXEC|PROT_READ|PROT_WRITE, ir_pkey) != 0) {
+#ifdef IR_DEBUG
+			fprintf(stderr, "pkey_mprotect() failed\n");
+#endif
+			munmap(ret, size);
+			return NULL;
+		}
+		return ret;
+	}
+#endif
 	int prot_flags = PROT_EXEC;
 #if defined(__NetBSD__)
 	prot_flags |= PROT_MPROTECT(PROT_READ|PROT_WRITE);
@@ -1852,11 +1912,28 @@ void *ir_mem_mmap(size_t size)
 int ir_mem_unmap(void *ptr, size_t size)
 {
 	munmap(ptr, size);
+#ifdef HAVE_PKEY_MPROTECT
+//	if (ir_pkey > 0) {
+//		pkey_free(ir_pkey);
+//		ir_pkey = 0;
+//	}
+#endif
 	return 1;
 }

 int ir_mem_protect(void *ptr, size_t size)
 {
+#ifdef HAVE_PKEY_MPROTECT
+	if (ir_pkey > 0) {
+		if (pkey_set(ir_pkey, PKEY_DISABLE_WRITE)) {
+#ifdef IR_DEBUG
+			fprintf(stderr, "mprotect() failed\n");
+#endif
+			return 0;
+		}
+		return 1;
+	}
+#endif
 	if (mprotect(ptr, size, PROT_READ | PROT_EXEC) != 0) {
 #ifdef IR_DEBUG
 		fprintf(stderr, "mprotect() failed\n");
@@ -1868,6 +1945,17 @@ int ir_mem_protect(void *ptr, size_t size)

 int ir_mem_unprotect(void *ptr, size_t size)
 {
+#ifdef HAVE_PKEY_MPROTECT
+	if (ir_pkey > 0) {
+		if (pkey_set(ir_pkey, PKEY_DISABLE_EXECUTE)) {
+#ifdef IR_DEBUG
+			fprintf(stderr, "mprotect() failed\n");
+#endif
+			return 0;
+		}
+		return 1;
+	}
+#endif
 	if (mprotect(ptr, size, PROT_READ | PROT_WRITE) != 0) {
 #ifdef IR_DEBUG
 		fprintf(stderr, "mprotect() failed\n");
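The new HAVE_PKEY_MPROTECT path replaces per-call mprotect() syscalls for W^X toggling with memory protection keys: the JIT buffer stays mapped RWX at the page-table level, is tagged once with pkey_mprotect(), and write access is then flipped per thread with pkey_set(), which compiles to a cheap userspace WRPKRU instruction instead of a syscall. The symbols are declared weak so the code still links against a libc without pkey support. A minimal standalone sketch of the same pattern, assuming Linux x86-64 with MPK support and glibc 2.27+:

    #define _GNU_SOURCE
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t size = 4096;
        int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);  /* writes denied by default */
        void *buf = mmap(NULL, size, PROT_READ|PROT_WRITE|PROT_EXEC,
                         MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);

        if (pkey <= 0 || buf == MAP_FAILED) {
            return 1;
        }
        /* Tag the pages once; their permissions stay RWX, and the per-thread
           PKRU register now decides whether writes are actually allowed. */
        if (pkey_mprotect(buf, size, PROT_READ|PROT_WRITE|PROT_EXEC, pkey) != 0) {
            return 1;
        }
        pkey_set(pkey, 0);                   /* unprotect: enable writes, no syscall */
        memset(buf, 0xC3, 16);               /* patch in some code (x86 RET bytes)   */
        pkey_set(pkey, PKEY_DISABLE_WRITE);  /* protect again                        */

        munmap(buf, size);
        pkey_free(pkey);
        return 0;
    }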
@@ -2070,7 +2158,26 @@ IR_ALWAYS_INLINE ir_ref ir_find_aliasing_load_i(ir_ctx *ctx, ir_ref ref, ir_type
 			}
 		} else if (insn->op == IR_RSTORE) {
 			modified_regset |= (1 << insn->op3);
-		} else if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN || insn->op == IR_CALL || insn->op == IR_VSTORE) {
+		} else if (insn->op == IR_CALL) {
+			ir_insn *func = &ctx->ir_base[insn->op2];
+			ir_ref func_proto;
+			const ir_proto_t *proto;
+
+			if (func->op == IR_FUNC || func->op == IR_FUNC_ADDR) {
+				func_proto = func->proto;
+			} else if (func->op == IR_PROTO) {
+				func_proto = func->op2;
+			} else {
+				break;
+			}
+			if (!func_proto) {
+				break;
+			}
+			proto = (const ir_proto_t *)ir_get_str(ctx, func_proto);
+			if (!(proto->flags & (IR_CONST_FUNC|IR_PURE_FUNC))) {
+				break;
+			}
+		} else if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN || insn->op == IR_VSTORE) {
 			return IR_UNUSED;
 		}
 		ref = insn->op1;
@@ -2116,7 +2223,26 @@ IR_ALWAYS_INLINE ir_ref ir_find_aliasing_vload_i(ir_ctx *ctx, ir_ref ref, ir_typ
 					break;
 				}
 			}
-		} else if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN || insn->op == IR_CALL || insn->op == IR_STORE) {
+		} else if (insn->op == IR_CALL) {
+			ir_insn *func = &ctx->ir_base[insn->op2];
+			ir_ref func_proto;
+			const ir_proto_t *proto;
+
+			if (func->op == IR_FUNC || func->op == IR_FUNC_ADDR) {
+				func_proto = func->proto;
+			} else if (func->op == IR_PROTO) {
+				func_proto = func->op2;
+			} else {
+				break;
+			}
+			if (!func_proto) {
+				break;
+			}
+			proto = (const ir_proto_t *)ir_get_str(ctx, func_proto);
+			if (!(proto->flags & (IR_CONST_FUNC|IR_PURE_FUNC))) {
+				break;
+			}
+		} else if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN || insn->op == IR_STORE) {
 			break;
 		}
 		ref = insn->op1;
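Previously any IR_CALL terminated the backward scan for an aliasing load. Both scanners now look up the callee's prototype and keep scanning past calls flagged IR_CONST_FUNC or IR_PURE_FUNC, since such functions cannot write memory; all other calls still stop the search. Roughly the C situation this enables:

    /* With `pure`, the second read of *p can reuse the first one. */
    extern int lookup(int key) __attribute__((pure));  /* reads, never writes */

    static int twice(const int *p)
    {
        int a = *p;
        int b = lookup(a);   /* the call cannot clobber *p            */
        int c = *p;          /* forwarded from `a` instead of reloaded */
        return b + c;
    }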
@@ -3013,6 +3139,16 @@ void _ir_IJMP(ir_ctx *ctx, ir_ref addr)
 	ctx->control = IR_UNUSED;
 }

+ir_ref _ir_IGOTO(ir_ctx *ctx, ir_ref addr)
+{
+	ir_ref ref;
+
+	IR_ASSERT(ctx->control);
+	ctx->control = ref = ir_emit2(ctx, IR_IGOTO, ctx->control, addr);
+	ctx->control = IR_UNUSED;
+	return ref;
+}
+
 ir_ref _ir_ADD_OFFSET(ir_ctx *ctx, ir_ref addr, uintptr_t offset)
 {
 	if (offset) {
@@ -3135,6 +3271,18 @@ void _ir_VSTORE(ir_ctx *ctx, ir_ref var, ir_ref val)
 	ctx->control = ir_emit3(ctx, IR_VSTORE, ctx->control, var, val);
 }

+ir_ref _ir_VLOAD_v(ir_ctx *ctx, ir_type type, ir_ref var)
+{
+	IR_ASSERT(ctx->control);
+	return ctx->control = ir_emit2(ctx, IR_OPT(IR_VLOAD_v, type), ctx->control, var);
+}
+
+void _ir_VSTORE_v(ir_ctx *ctx, ir_ref var, ir_ref val)
+{
+	IR_ASSERT(ctx->control);
+	ctx->control = ir_emit3(ctx, IR_VSTORE_v, ctx->control, var, val);
+}
+
 ir_ref _ir_TLS(ir_ctx *ctx, ir_ref index, ir_ref offset)
 {
 	IR_ASSERT(ctx->control);
@@ -3193,6 +3341,18 @@ void _ir_STORE(ir_ctx *ctx, ir_ref addr, ir_ref val)
 	ctx->control = ir_emit3(ctx, IR_STORE, ctx->control, addr, val);
 }

+ir_ref _ir_LOAD_v(ir_ctx *ctx, ir_type type, ir_ref addr)
+{
+	IR_ASSERT(ctx->control);
+	return ctx->control = ir_emit2(ctx, IR_OPT(IR_LOAD_v, type), ctx->control, addr);
+}
+
+void _ir_STORE_v(ir_ctx *ctx, ir_ref addr, ir_ref val)
+{
+	IR_ASSERT(ctx->control);
+	ctx->control = ir_emit3(ctx, IR_STORE_v, ctx->control, addr, val);
+}
+
 void _ir_VA_START(ir_ctx *ctx, ir_ref list)
 {
 	IR_ASSERT(ctx->control);
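The four new _v builders create volatile variants: loads and stores the optimizer must neither remove, merge, nor reorder against one another, mirroring C's volatile semantics (ir_check.c and the backends handle them alongside their plain counterparts, while the folding and aliasing passes leave them alone). A classic case they exist for:

    /* A load that must stay in the loop: each iteration re-reads memory.
       Built with ir_LOAD_v(); a plain ir_LOAD() could legally be hoisted
       out, and the loop would never observe the device flipping the bit. */
    static int wait_ready(volatile const int *status_reg)
    {
        while ((*status_reg & 1) == 0) {
            /* spin */
        }
        return *status_reg;
    }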
@@ -3217,11 +3377,13 @@ ir_ref _ir_VA_ARG(ir_ctx *ctx, ir_type type, ir_ref list)
 	return ctx->control = ir_emit2(ctx, IR_OPT(IR_VA_ARG, type), ctx->control, list);
 }

-ir_ref _ir_VA_ARG_EX(ir_ctx *ctx, ir_type type, ir_ref list, size_t size)
+ir_ref _ir_VA_ARG_EX(ir_ctx *ctx, ir_type type, ir_ref list, size_t size, size_t align)
 {
 	IR_ASSERT(ctx->control);
-	IR_ASSERT(size <= 0x7fffffff);
-	return ctx->control = ir_emit3(ctx, IR_OPT(IR_VA_ARG, type), ctx->control, list, (ir_ref)size);
+	IR_ASSERT(size <= 0x0fffffff);
+	IR_ASSERT(align != 0 && ((align & (align - 1)) == 0) && align <= 128);
+	return ctx->control = ir_emit3(ctx, IR_OPT(IR_VA_ARG, type), ctx->control, list,
+		(ir_ref)IR_VA_ARG_OP3(size, align));
 }

 ir_ref _ir_BLOCK_BEGIN(ir_ctx *ctx)
diff --git a/ext/opcache/jit/ir/ir.h b/ext/opcache/jit/ir/ir.h
index 8fcfbffa7d6..a9665059705 100644
--- a/ext/opcache/jit/ir/ir.h
+++ b/ext/opcache/jit/ir/ir.h
@@ -216,6 +216,7 @@ typedef enum _ir_type {
  * prb - branch probability 1-99 (0 - unspecified): (IF_TRUE, IF_FALSE, CASE_VAL, CASE_DEFAULT)
  * opt - optional number
  * pro - function prototype
+ * lbl - label used as value (a reference to constant): (BEGIN)
  *
  * The order of IR opcodes is carefully selected for efficient folding.
  * - foldable instruction go first
@@ -322,6 +323,7 @@ typedef enum _ir_type {
 	_(FUNC_ADDR,    r0,   ___, ___, ___) /* constant func ref           */ \
 	_(FUNC,         r0,   ___, ___, ___) /* constant func ref           */ \
 	_(SYM,          r0,   ___, ___, ___) /* constant symbol ref         */ \
+	_(LABEL,        r0,   ___, ___, ___) /* label address ref           */ \
 	_(STR,          r0,   ___, ___, ___) /* constant str ref            */ \
 	\
 	/* call ops                                                         */ \
@@ -334,11 +336,15 @@ typedef enum _ir_type {
 	_(BLOCK_BEGIN,  a1,   src, ___, ___) /* stacksave                   */ \
 	_(BLOCK_END,    a2,   src, def, ___) /* stackrestore                */ \
 	_(VLOAD,        l2,   src, var, ___) /* load value of local var     */ \
+	_(VLOAD_v,      l2,   src, var, ___) /* volatile variant of VLOAD   */ \
 	_(VSTORE,       s3,   src, var, def) /* store value to local var    */ \
+	_(VSTORE_v,     s3,   src, var, def) /* volatile variant of VSTORE  */ \
 	_(RLOAD,        l1X2, src, num, opt) /* load value from register    */ \
 	_(RSTORE,       s2X1, src, def, num) /* store value into register   */ \
 	_(LOAD,         l2,   src, ref, ___) /* load from memory            */ \
+	_(LOAD_v,       l2,   src, ref, ___) /* volatile variant of LOAD    */ \
 	_(STORE,        s3,   src, ref, def) /* store to memory             */ \
+	_(STORE_v,      s3,   src, ref, def) /* volatile variant of STORE   */ \
 	_(TLS,          l1X2, src, num, num) /* thread local variable       */ \
 	_(TRAP,         x1,   src, ___, ___) /* DebugBreak                  */ \
 	/* memory reference ops (A, H, U, S, TMP, STR, NEW, X, V) ???       */ \
@@ -360,7 +366,7 @@ typedef enum _ir_type {
 	/* control-flow nodes                                               */ \
 	_(START,        S0X1, ret, ___, ___) /* function start              */ \
 	_(ENTRY,        S1X1, src, num, ___) /* entry with a fake src edge  */ \
-	_(BEGIN,        S1,   src, ___, ___) /* block start                 */ \
+	_(BEGIN,        S1X1, src, lbl, ___) /* block start, optional &&lbl */ \
 	_(IF_TRUE,      S1X1, src, prb, ___) /* IF TRUE proj.               */ \
 	_(IF_FALSE,     S1X1, src, prb, ___) /* IF FALSE proj.              */ \
 	_(CASE_VAL,     S2X1, src, def, prb) /* switch proj.                */ \
@@ -372,8 +378,9 @@ typedef enum _ir_type {
 	_(LOOP_END,     E1,   src, ___, ___) /* loop end                    */ \
 	_(IF,           E2,   src, def, ___) /* conditional control split   */ \
 	_(SWITCH,       E2,   src, def, ___) /* multi-way control split     */ \
+	_(IGOTO,        E2,   src, def, ___) /* computed goto (internal)    */ \
+	_(IJMP,         T2X1, src, def, ret) /* computed goto (terminating) */ \
 	_(RETURN,       T2X1, src, def, ret) /* function return             */ \
-	_(IJMP,         T2X1, src, def, ret) /* computed goto               */ \
 	_(UNREACHABLE,  T1X2, src, ___, ret) /* unreachable (tailcall, etc) */ \
 	\
 	/* deoptimization helper                                            */ \
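Together, LABEL constants, the optional lbl operand on BEGIN, and the IGOTO/IJMP pair model GNU C's labels-as-values extension: &&label yields an address constant (ir_const_label), a BEGIN that carries a label is an indirect-branch target, and goto *expr lowers to IGOTO (or IJMP when it terminates the region). A rough C analogue of what a frontend would lower through these nodes:

    /* Threaded dispatch via GNU C computed goto (the construct IGOTO models). */
    static int run(const unsigned char *ops, int n)
    {
        static void *handlers[] = { &&do_inc, &&do_halt };  /* IR_LABEL consts */
        int acc = 0, i = 0;

        goto *handlers[ops[i]];      /* lowered to IR_IGOTO               */
    do_inc:                          /* a BEGIN block with op2 = its label */
        acc++;
        if (++i < n) goto *handlers[ops[i]];
    do_halt:
        return acc;
    }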
@@ -400,6 +407,13 @@ typedef enum _ir_op {
 #define IR_OPTX(op, type, n) ((uint32_t)(op) | ((uint32_t)(type) << IR_OPT_TYPE_SHIFT) | ((uint32_t)(n) << IR_OPT_INPUTS_SHIFT))
 #define IR_OPT_TYPE(opt)     (((opt) & IR_OPT_TYPE_MASK) >> IR_OPT_TYPE_SHIFT)

+/* "opt" modifiers */
+#define IR_COPY_HARD         (1<<0)
+
+#define IR_VA_ARG_SIZE(op3)  (((uint32_t)(op3) >> 3))
+#define IR_VA_ARG_ALIGN(op3) (1U << ((uint32_t)(op3) & 0x7))
+#define IR_VA_ARG_OP3(s, a)  (((s) << 3) | ir_ntzl(a))
+
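The new IR_VA_ARG_OP3 helpers pack both an aggregate's size and its alignment into the single op3 slot: the alignment is a power of two, so only its log2 goes in the low 3 bits (via ir_ntzl, evidently a count-trailing-zeros helper), leaving the rest for the size. That is why _ir_VA_ARG_EX now asserts size <= 0x0fffffff and align a power of two <= 128 (= 2^7, the largest value 3 bits can express). A worked example:

    #include <assert.h>
    #include <stdint.h>

    /* stand-in for ir_ntzl (count trailing zeros) */
    static unsigned ntzl(uint64_t v) { return (unsigned)__builtin_ctzll(v); }

    int main(void)
    {
        uint32_t size = 24, align = 8;
        uint32_t op3  = (size << 3) | ntzl(align);  /* IR_VA_ARG_OP3 -> 195 */

        assert((op3 >> 3) == 24);                   /* IR_VA_ARG_SIZE       */
        assert((1u << (op3 & 0x7)) == 8);           /* IR_VA_ARG_ALIGN      */
        return 0;
    }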
 /* IR References */
 typedef int32_t ir_ref;

@@ -533,6 +547,9 @@ void ir_strtab_free(ir_strtab *strtab);
 #define IR_EXTERN              (1<<5)
 #define IR_CONST               (1<<6)

+#define IR_CONST_FUNC          (1<<6)
+#define IR_PURE_FUNC           (1<<7)
+
 #define IR_INITIALIZED         (1<<7) /* sym data flag: constant or an initialized variable */
 #define IR_CONST_STRING        (1<<8) /* sym data flag: constant string */

@@ -648,7 +665,6 @@ struct _ir_ctx {
 		ir_ref         vars;                    /* list of VARs (used by register allocator) */
 	};
 	ir_snapshot_create_t   snapshot_create;
-	int32_t            stack_frame_alignment;
 	int32_t            stack_frame_size;        /* spill stack frame size (used by register allocator and code generator) */
 	int32_t            call_stack_size;         /* stack for parameter passing (used by register allocator and code generator) */
 	uint64_t           used_preserved_regs;
@@ -698,6 +714,7 @@ ir_ref ir_const_func_addr(ir_ctx *ctx, uintptr_t c, ir_ref proto);
 ir_ref ir_const_func(ir_ctx *ctx, ir_ref str, ir_ref proto);
 ir_ref ir_const_sym(ir_ctx *ctx, ir_ref str);
 ir_ref ir_const_str(ir_ctx *ctx, ir_ref str);
+ir_ref ir_const_label(ir_ctx *ctx, ir_ref str);

 ir_ref ir_unique_const_addr(ir_ctx *ctx, uintptr_t c);

@@ -893,6 +910,7 @@ struct _ir_loader {
 	void*(*resolve_sym_name)  (ir_loader *loader, const char *name, uint32_t flags);
 	bool (*has_sym)           (ir_loader *loader, const char *name);
 	bool (*add_sym)           (ir_loader *loader, const char *name, void *addr);
+	bool (*add_label)         (ir_loader *loader, const char *name, void *addr);
 };

 void ir_loader_init(void);
diff --git a/ext/opcache/jit/ir/ir_aarch64.dasc b/ext/opcache/jit/ir/ir_aarch64.dasc
index 12c3694d469..b553243309f 100644
--- a/ext/opcache/jit/ir/ir_aarch64.dasc
+++ b/ext/opcache/jit/ir/ir_aarch64.dasc
@@ -218,6 +218,7 @@ typedef struct _ir_backend_data {
 	dasm_State        *dasm_state;
 	ir_bitset          emit_constants;
 	int                rodata_label, jmp_table_label;
+	bool               resolved_label_syms;
 } ir_backend_data;

 #define IR_GP_REG_NAME(code, name64, name32) \
@@ -315,6 +316,7 @@ const char *ir_reg_name(int8_t reg, ir_type type)
 	_(RETURN_VOID)         \
 	_(RETURN_INT)          \
 	_(RETURN_FP)           \
+	_(IGOTO_DUP)           \

 #define IR_RULE_ENUM(name) IR_ ## name,

@@ -385,7 +387,7 @@ int ir_get_target_constraints(ir_ctx *ctx, ir_ref ref, ir_target_constraints *co
 						n++;
 						break;
 				}
-			} else if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
+			} else if (!IR_IS_CONST_REF(insn->op2) && ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
 				constraints->tmp_regs[n] = IR_TMP_REG(2, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
 				n++;
 			}
@@ -478,10 +480,16 @@ int ir_get_target_constraints(ir_ctx *ctx, ir_ref ref, ir_target_constraints *co
 			if (IR_IS_CONST_REF(insn->op1)) {
 				constraints->tmp_regs[n] = IR_TMP_REG(1, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
 				n++;
+			} else if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
+				constraints->tmp_regs[n] = IR_TMP_REG(1, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
+				n++;
 			}
 			if (IR_IS_CONST_REF(insn->op2) && insn->op1 != insn->op2) {
 				constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
 				n++;
+			} else if (!IR_IS_CONST_REF(insn->op2) && ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
+				constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
+				n++;
 			}
 			break;
 		case IR_CMP_INT:
@@ -520,6 +528,7 @@ int ir_get_target_constraints(ir_ctx *ctx, ir_ref ref, ir_target_constraints *co
 			}
 			break;
 		case IR_VSTORE:
+		case IR_VSTORE_v:
 			insn = &ctx->ir_base[ref];
 			if (IR_IS_CONST_REF(insn->op3)) {
 				insn = &ctx->ir_base[insn->op3];
@@ -596,6 +605,19 @@ int ir_get_target_constraints(ir_ctx *ctx, ir_ref ref, ir_target_constraints *co
 			}
 			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
 			break;
+		case IR_IGOTO:
+			insn = &ctx->ir_base[ref];
+			if (ctx->ir_base[insn->op1].op == IR_MERGE || ctx->ir_base[insn->op1].op == IR_LOOP_BEGIN) {
+				ir_insn *merge = &ctx->ir_base[insn->op1];
+				ir_ref *p, n = merge->inputs_count;
+
+				for (p = merge->ops + 1; n > 0; p++, n--) {
+					ir_ref input = *p;
+					IR_ASSERT(ctx->ir_base[input].op == IR_END || ctx->ir_base[input].op == IR_LOOP_END);
+					ctx->rules[input] = IR_IGOTO_DUP;
+				}
+			}
+			return insn->op;
 		case IR_COND:
 			insn = &ctx->ir_base[ref];
 			n = 0;
@@ -665,7 +687,7 @@ int ir_get_target_constraints(ir_ctx *ctx, ir_ref ref, ir_target_constraints *co
 			}
 			break;
 		case IR_VA_ARG:
-			flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
+			flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_DEF_CONFLICTS_WITH_INPUT_REGS;
 			constraints->tmp_regs[0] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
 			n = 1;
 			insn = &ctx->ir_base[ref];
@@ -714,7 +736,8 @@ static void ir_match_fuse_addr(ir_ctx *ctx, ir_ref addr_ref, ir_type type)

 				do {
 					ir_insn *insn = &ctx->ir_base[*p];
-					if (insn->op != IR_LOAD && (insn->op != IR_STORE || insn->op3 == addr_ref)) {
+					if (insn->op != IR_LOAD && insn->op != IR_LOAD_v
+					 && ((insn->op != IR_STORE && insn->op != IR_STORE_v) || insn->op3 == addr_ref)) {
 						return;
 					}
 					p++;
@@ -961,7 +984,7 @@ binop_fp:
 			ctx->flags2 |= IR_HAS_CALLS;
 			return IR_CALL;
 		case IR_VAR:
-			return IR_SKIPPED | IR_VAR;
+			return IR_STATIC_ALLOCA;
 		case IR_PARAM:
 			return ctx->use_lists[ref].count > 0 ? IR_PARAM : IR_SKIPPED | IR_PARAM;
 		case IR_ALLOCA:
@@ -978,6 +1001,7 @@ binop_fp:
 			}
 			return IR_ALLOCA;
 		case IR_LOAD:
+		case IR_LOAD_v:
 			ir_match_fuse_addr(ctx, insn->op2, insn->type);
 			if (IR_IS_TYPE_INT(insn->type)) {
 				return IR_LOAD_INT;
@@ -986,6 +1010,7 @@ binop_fp:
 			}
 			break;
 		case IR_STORE:
+		case IR_STORE_v:
 			ir_match_fuse_addr(ctx, insn->op2, ctx->ir_base[insn->op3].type);
 			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op3].type)) {
 				return IR_STORE_INT;
@@ -1364,7 +1389,7 @@ static void ir_emit_load_imm_fp(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref sr
 	} else if (type == IR_DOUBLE && insn->val.u64 == 0) {
 		|	fmov Rd(reg-IR_REG_FP_FIRST), xzr
 	} else {
-		label = ir_const_label(ctx, src);
+		label = ir_get_const_label(ctx, src);
 		if (type == IR_DOUBLE) {
 			|	ldr Rd(reg-IR_REG_FP_FIRST), =>label
 		} else {
@@ -1441,10 +1466,41 @@ static void ir_load_local_addr(ir_ctx *ctx, ir_reg reg, ir_ref src)
 		|	add Rx(reg), Rx(base), #offset
 	} else {
 		ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
-		|	add Rx(reg), sp, Rx(IR_REG_INT_TMP)
+		|	add Rx(reg), Rx(base), Rx(IR_REG_INT_TMP)
 	}
 }

+static void ir_resolve_label_syms(ir_ctx *ctx)
+{
+	uint32_t b;
+	ir_block *bb;
+
+	for (b = 1, bb = &ctx->cfg_blocks[b]; b <= ctx->cfg_blocks_count; bb++, b++) {
+		ir_insn *insn = &ctx->ir_base[bb->start];
+
+		if (insn->op == IR_BEGIN && insn->op2) {
+			IR_ASSERT(ctx->ir_base[insn->op2].op == IR_LABEL);
+			ctx->ir_base[insn->op2].val.u32_hi = b;
+		}
+	}
+}
+
+static void ir_emit_load_label_addr(ir_ctx *ctx, ir_reg reg, ir_insn *label)
+{
+	ir_backend_data *data = ctx->data;
+	dasm_State **Dst = &data->dasm_state;
+
+	if (!data->resolved_label_syms) {
+		data->resolved_label_syms = 1;
+		ir_resolve_label_syms(ctx);
+	}
+
+	IR_ASSERT(label->op == IR_LABEL);
+	int b = label->val.u32_hi;
+
+	b = ir_skip_empty_target_blocks(ctx, b);
+	|	adr Rx(reg), =>b
+}

 static void ir_emit_load(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src)
 {
@@ -1459,9 +1515,11 @@ static void ir_emit_load(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src)
 			} else if (insn->op == IR_STR) {
 				ir_backend_data *data = ctx->data;
 				dasm_State **Dst = &data->dasm_state;
-				int label = ir_const_label(ctx, src);
+				int label = ir_get_const_label(ctx, src);

 				|	adr Rx(reg), =>label
+			} else if (insn->op == IR_LABEL) {
+				ir_emit_load_label_addr(ctx, reg, insn);
 			} else {
 				ir_emit_load_imm_int(ctx, type, reg, insn->val.i64);
 			}
@@ -1697,6 +1755,7 @@ static void ir_emit_prologue(ir_ctx *ctx)
 							|	str Rd(i-IR_REG_FP_FIRST), [Rx(fp), #offset]
 						} else {
 							ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+							offset -= sizeof(void*);
 							|	str Rx(prev), [Rx(fp), Rx(IR_REG_INT_TMP)]
 							|	sub Rx(IR_REG_INT_TMP), Rx(IR_REG_INT_TMP), #8
 							|	str Rd(i-IR_REG_FP_FIRST), [Rx(fp), Rx(IR_REG_INT_TMP)]
@@ -1795,7 +1854,12 @@ static void ir_emit_prologue(ir_ctx *ctx)
 			offset += 16 * ctx->fp_reg_params;
 			for (i = ctx->fp_reg_params; i < IR_REG_FP_ARGS; i++) {
 				// TODO: Rd->Rq stur->str ???
-				|	str Rd(fp_reg_params[i]-IR_REG_FP_FIRST), [Rx(fp), #offset]
+				if (aarch64_may_encode_addr_offset(offset, 8)) {
+					|	str Rd(fp_reg_params[i]-IR_REG_FP_FIRST), [Rx(fp), #offset]
+				} else {
+					ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+					|	str Rd(fp_reg_params[i]-IR_REG_FP_FIRST), [Rx(fp), Rx(IR_REG_INT_TMP)]
+				}
 				offset += 16;
 			}
 		}
@@ -1828,26 +1892,44 @@ static void ir_emit_epilogue(ir_ctx *ctx)
 					offset -= sizeof(void*) * 2;
 					if (aarch64_may_encode_imm7_addr_offset(offset, 8)) {
 						|	ldp Rx(prev), Rx(i), [Rx(fp), #offset]
-					} else {
-						IR_ASSERT(aarch64_may_encode_addr_offset(offset, 8));
+					} else if (aarch64_may_encode_addr_offset(offset + 8, 8)) {
 						|	ldr Rx(prev), [Rx(fp), #offset]
 						|	ldr Rx(i), [Rx(fp), #(offset+8)]
+					} else {
+						ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+						|	ldr Rx(prev), [Rx(fp), Rx(IR_REG_INT_TMP)]
+						|	add Rx(IR_REG_INT_TMP), Rx(IR_REG_INT_TMP), #8
+						|	ldr Rx(i), [Rx(fp), Rx(IR_REG_INT_TMP)]
 					}
 					prev = IR_REG_NONE;
 				} else {
 					if (prev < IR_REG_FP_FIRST) {
 						offset -= sizeof(void*);
-						|	ldr Rx(prev), [Rx(fp), #offset]
+						if (aarch64_may_encode_addr_offset(offset, 8)) {
+							|	ldr Rx(prev), [Rx(fp), #offset]
+						} else {
+							ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+							|	ldr Rx(prev), [Rx(fp), Rx(IR_REG_INT_TMP)]
+						}
 						offset -= sizeof(void*);
-						|	ldr Rd(i-IR_REG_FP_FIRST), [Rx(fp), #offset]
+						if (aarch64_may_encode_addr_offset(offset, 8)) {
+							|	ldr Rd(i-IR_REG_FP_FIRST), [Rx(fp), #offset]
+						} else {
+							ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+							|	ldr Rd(i-IR_REG_FP_FIRST), [Rx(fp), Rx(IR_REG_INT_TMP)]
+						}
 					} else {
 						offset -= sizeof(void*) * 2;
 						if (aarch64_may_encode_imm7_addr_offset(offset, 8)) {
 							|	ldp Rd(prev-IR_REG_FP_FIRST), Rd(i-IR_REG_FP_FIRST), [Rx(fp), #offset]
-						} else {
-							IR_ASSERT(aarch64_may_encode_addr_offset(offset, 8));
+						} else if (aarch64_may_encode_addr_offset(offset + 8, 8)) {
 							|	ldr Rd(prev-IR_REG_FP_FIRST), [Rx(fp), #offset]
 							|	ldr Rd(i-IR_REG_FP_FIRST), [Rx(fp), #(offset+8)]
+						} else {
+							ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+							|	ldr Rx(prev-IR_REG_FP_FIRST), [Rx(fp), Rx(IR_REG_INT_TMP)]
+							|	add Rx(IR_REG_INT_TMP), Rx(IR_REG_INT_TMP), #8
+							|	ldr Rx(i-IR_REG_FP_FIRST), [Rx(fp), Rx(IR_REG_INT_TMP)]
 						}
 					}
 					prev = IR_REG_NONE;
@@ -1857,10 +1939,20 @@ static void ir_emit_epilogue(ir_ctx *ctx)
 	    if (prev != IR_REG_NONE) {
 			if (prev < IR_REG_FP_FIRST) {
 				offset -= sizeof(void*);
-				|	ldr Rx(prev), [Rx(fp), #offset]
+				if (aarch64_may_encode_addr_offset(offset, 8)) {
+					|	ldr Rx(prev), [Rx(fp), #offset]
+				} else {
+					ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+					|	ldr Rx(prev), [Rx(fp), Rx(IR_REG_INT_TMP)]
+				}
 			} else {
 				offset -= sizeof(void*);
-				|	ldr Rd(prev-IR_REG_FP_FIRST), [Rx(fp), #offset]
+				if (aarch64_may_encode_addr_offset(offset, 8)) {
+					|	ldr Rd(prev-IR_REG_FP_FIRST), [Rx(fp), #offset]
+				} else {
+					ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+					|	ldr Rd(prev-IR_REG_FP_FIRST), [Rx(fp), Rx(IR_REG_INT_TMP)]
+				}
 			}
 		}
 	}
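Most of the AArch64 churn in this hunk and the ones below follows one pattern: LDR/STR with an unsigned immediate offset only encodes 12 bits scaled by the access size, so once spill frames grow past that range the offset is materialized into IR_REG_INT_TMP and the register-offset addressing form is used, instead of asserting as before. A sketch of the reachability check, assuming this is what aarch64_may_encode_addr_offset tests:

    #include <stdbool.h>
    #include <stdint.h>

    /* LDR/STR (unsigned immediate): offset = imm12 * size, imm12 in [0, 4095]. */
    static bool may_encode_addr_offset(int64_t offset, uint32_t size)
    {
        return offset >= 0
            && offset % size == 0
            && offset / size <= 4095;
    }
    /* may_encode_addr_offset(32760, 8) -> true  (4095 * 8)
       may_encode_addr_offset(32768, 8) -> false (needs a temp register) */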
@@ -1909,6 +2001,9 @@ static void ir_emit_binop_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 		op1_reg = IR_REG_NUM(op1_reg);
 		ir_emit_load(ctx, type, op1_reg, op1);
 	}
+	if (op2_reg == IR_REG_NONE && op1 == op2) {
+		op2_reg = op1_reg;
+	}
 	if (op2_reg != IR_REG_NONE) {
 		if (IR_REG_SPILLED(op2_reg)) {
 			op2_reg = IR_REG_NUM(op2_reg);
@@ -3415,25 +3510,52 @@ static void ir_emit_sext(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 		int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op1, &fp);

 		if (ir_type_size[src_type] == 1) {
-			if (ir_type_size[dst_type] == 2) {
-				|	ldrsb Rw(def_reg), [Rx(fp), #offset]
-			} else if (ir_type_size[dst_type] == 4) {
-				|	ldrsb Rw(def_reg), [Rx(fp), #offset]
+			if (aarch64_may_encode_addr_offset(offset, ir_type_size[src_type])) {
+				if (ir_type_size[dst_type] == 2) {
+					|	ldrsb Rw(def_reg), [Rx(fp), #offset]
+				} else if (ir_type_size[dst_type] == 4) {
+					|	ldrsb Rw(def_reg), [Rx(fp), #offset]
+				} else {
+					IR_ASSERT(ir_type_size[dst_type] == 8);
+					|	ldrsb Rx(def_reg), [Rx(fp), #offset]
+				}
 			} else {
-				IR_ASSERT(ir_type_size[dst_type] == 8);
-				|	ldrsb Rx(def_reg), [Rx(fp), #offset]
+				ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+				if (ir_type_size[dst_type] == 2) {
+					|	ldrsb Rw(def_reg), [Rx(fp), Rx(IR_REG_INT_TMP)]
+				} else if (ir_type_size[dst_type] == 4) {
+					|	ldrsb Rw(def_reg), [Rx(fp), Rx(IR_REG_INT_TMP)]
+				} else {
+					IR_ASSERT(ir_type_size[dst_type] == 8);
+					|	ldrsb Rx(def_reg), [Rx(fp), Rx(IR_REG_INT_TMP)]
+				}
 			}
 		} else if (ir_type_size[src_type] == 2) {
-			if (ir_type_size[dst_type] == 4) {
-				|	ldrsh Rw(def_reg), [Rx(fp), #offset]
+			if (aarch64_may_encode_addr_offset(offset, ir_type_size[src_type])) {
+				if (ir_type_size[dst_type] == 4) {
+					|	ldrsh Rw(def_reg), [Rx(fp), #offset]
+				} else {
+					IR_ASSERT(ir_type_size[dst_type] == 8);
+					|	ldrsh Rx(def_reg), [Rx(fp), #offset]
+				}
 			} else {
-				IR_ASSERT(ir_type_size[dst_type] == 8);
-				|	ldrsh Rx(def_reg), [Rx(fp), #offset]
+				ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+				if (ir_type_size[dst_type] == 4) {
+					|	ldrsh Rw(def_reg), [Rx(fp), Rx(IR_REG_INT_TMP)]
+				} else {
+					IR_ASSERT(ir_type_size[dst_type] == 8);
+					|	ldrsh Rx(def_reg), [Rx(fp), Rx(IR_REG_INT_TMP)]
+				}
 			}
 		} else {
 			IR_ASSERT(ir_type_size[src_type] == 4);
 			IR_ASSERT(ir_type_size[dst_type] == 8);
-			|	ldrsw Rx(def_reg), [Rx(fp), #offset]
+			if (aarch64_may_encode_addr_offset(offset, ir_type_size[src_type])) {
+				|	ldrsw Rx(def_reg), [Rx(fp), #offset]
+			} else {
+				ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+				|	ldrsw Rx(def_reg), [Rx(fp), Rx(IR_REG_INT_TMP)]
+			}
 		}
 	}
 	if (IR_REG_SPILLED(ctx->regs[def][0])) {
@@ -3473,14 +3595,27 @@ static void ir_emit_zext(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 		ir_reg fp;
 		int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op1, &fp);

-		if (ir_type_size[src_type] == 1) {
-			|	ldrb Rw(def_reg), [Rx(fp), #offset]
-		} else if (ir_type_size[src_type] == 2) {
-			|	ldrh Rw(def_reg), [Rx(fp), #offset]
+		if (aarch64_may_encode_addr_offset(offset, ir_type_size[src_type])) {
+			if (ir_type_size[src_type] == 1) {
+				|	ldrb Rw(def_reg), [Rx(fp), #offset]
+			} else if (ir_type_size[src_type] == 2) {
+				|	ldrh Rw(def_reg), [Rx(fp), #offset]
+			} else {
+				IR_ASSERT(ir_type_size[src_type] == 4);
+				IR_ASSERT(ir_type_size[dst_type] == 8);
+				|	ldr Rw(def_reg), [Rx(fp), #offset]
+			}
 		} else {
-			IR_ASSERT(ir_type_size[src_type] == 4);
-			IR_ASSERT(ir_type_size[dst_type] == 8);
-			|	ldr Rw(def_reg), [Rx(fp), #offset]
+			ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+			if (ir_type_size[src_type] == 1) {
+				|	ldrb Rw(def_reg), [Rx(fp), Rx(IR_REG_INT_TMP)]
+			} else if (ir_type_size[src_type] == 2) {
+				|	ldrh Rw(def_reg), [Rx(fp), Rx(IR_REG_INT_TMP)]
+			} else {
+				IR_ASSERT(ir_type_size[src_type] == 4);
+				IR_ASSERT(ir_type_size[dst_type] == 8);
+				|	ldr Rw(def_reg), [Rx(fp), Rx(IR_REG_INT_TMP)]
+			}
 		}
 	}
 	if (IR_REG_SPILLED(ctx->regs[def][0])) {
@@ -3579,11 +3714,21 @@ static void ir_emit_bitcast(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 			ir_reg fp;
 			int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op1, &fp);

-			if (src_type == IR_DOUBLE) {
-				|	ldr Rx(def_reg), [Rx(fp), #offset]
+			if (aarch64_may_encode_addr_offset(offset, ir_type_size[src_type])) {
+				if (src_type == IR_DOUBLE) {
+					|	ldr Rx(def_reg), [Rx(fp), #offset]
+				} else {
+					IR_ASSERT(src_type == IR_FLOAT);
+					|	ldr Rw(def_reg), [Rx(fp), #offset]
+				}
 			} else {
-				IR_ASSERT(src_type == IR_FLOAT);
-				|	ldr Rw(def_reg), [Rx(fp), #offset]
+				ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+				if (src_type == IR_DOUBLE) {
+					|	ldr Rx(def_reg), [Rx(fp), Rx(IR_REG_INT_TMP)]
+				} else {
+					IR_ASSERT(src_type == IR_FLOAT);
+					|	ldr Rw(def_reg), [Rx(fp), Rx(IR_REG_INT_TMP)]
+				}
 			}
 		}
 	} else if (IR_IS_TYPE_FP(dst_type)) {
@@ -3605,12 +3750,22 @@ static void ir_emit_bitcast(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 			ir_reg fp;
 			int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op1, &fp);

-			if (dst_type == IR_DOUBLE) {
-				|	ldr Rd(def_reg), [Rx(fp), #offset]
-			} else {
-				IR_ASSERT(src_type == IR_FLOAT);
-				|	ldr Rs(def_reg), [Rx(fp), #offset]
-			}
+			if (aarch64_may_encode_addr_offset(offset, ir_type_size[src_type])) {
+				if (dst_type == IR_DOUBLE) {
+					|	ldr Rd(def_reg), [Rx(fp), #offset]
+				} else {
+					IR_ASSERT(dst_type == IR_FLOAT);
+					|	ldr Rs(def_reg), [Rx(fp), #offset]
+				}
+			 } else {
+				ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+				if (dst_type == IR_DOUBLE) {
+					|	ldr Rd(def_reg), [Rx(fp), Rx(IR_REG_INT_TMP)]
+				} else {
+					IR_ASSERT(dst_type == IR_FLOAT);
+					|	ldr Rs(def_reg), [Rx(fp), Rx(IR_REG_INT_TMP)]
+				}
+			 }
 		}
 	}
 	if (IR_REG_SPILLED(ctx->regs[def][0])) {
@@ -3833,7 +3988,12 @@ static void ir_emit_vaddr(ir_ctx *ctx, ir_ref def, ir_insn *insn)

 	IR_ASSERT(def_reg != IR_REG_NONE);
 	offset = ir_var_spill_slot(ctx, insn->op1, &fp);
-	|	add Rx(def_reg), Rx(fp), #offset
+	if (aarch64_may_encode_imm12(offset)) {
+		|	add Rx(def_reg), Rx(fp), #offset
+	} else {
+		ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, offset);
+		|	add Rx(def_reg), Rx(fp), Rx(IR_REG_INT_TMP)
+	}
 	if (IR_REG_SPILLED(ctx->regs[def][0])) {
 		ir_emit_store(ctx, type, def, def_reg);
 	}
@@ -4221,7 +4381,12 @@ static void ir_emit_afree(ir_ctx *ctx, ir_ref def, ir_insn *insn)

 		/* Stack must be 16 byte aligned */
 		size = IR_ALIGNED_SIZE(size, 16);
-		|	add sp, sp, #size
+		if (aarch64_may_encode_imm12(size)) {
+			|	add sp, sp, #size
+		} else {
+			ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, size);
+			|	add sp, sp, Rx(IR_REG_INT_TMP)
+		}
 		if (!(ctx->flags & IR_USE_FRAME_POINTER)) {
 			ctx->call_stack_size -= size;
 		}
@@ -4283,8 +4448,11 @@ static void ir_emit_frame_addr(ir_ctx *ctx, ir_ref def)

 	if (ctx->flags & IR_USE_FRAME_POINTER) {
 		|	mov Rx(def_reg), Rx(IR_REG_X29)
-	} else {
+	} else if (aarch64_may_encode_imm12(ctx->stack_frame_size + ctx->call_stack_size)) {
 		|	add Rx(def_reg), Rx(IR_REG_X31), #(ctx->stack_frame_size + ctx->call_stack_size)
+	} else {
+		ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_INT_TMP, ctx->stack_frame_size + ctx->call_stack_size);
+		|	add Rx(def_reg), Rx(IR_REG_X31), Rx(IR_REG_INT_TMP)
 	}
 	if (IR_REG_SPILLED(ctx->regs[def][0])) {
 		ir_emit_store(ctx, IR_ADDR, def, def_reg);
@@ -4377,7 +4545,7 @@ static void ir_emit_va_start(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 	if ((ctx->flags2 & (IR_HAS_VA_ARG_FP|IR_HAS_VA_COPY)) && ctx->fp_reg_params < IR_REG_FP_ARGS) {
 		reg_save_area_offset += 16 * IR_REG_FP_ARGS;
 		/* Set va_list.vr_top */
-		if (overflow_arg_area_offset != reg_save_area_offset) {
+		if (overflow_arg_area_offset != reg_save_area_offset || ctx->gp_reg_params < IR_REG_INT_ARGS) {
 			|	add Rx(tmp_reg), Rx(fp), #reg_save_area_offset
 		}
 		|	str Rx(tmp_reg), [Rx(op2_reg), #(offset+offsetof(ir_va_list, vr_top))]
@@ -5246,6 +5414,19 @@ static void ir_emit_ijmp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 		}
 		|	br Rx(op2_reg)
 	} else if (IR_IS_CONST_REF(insn->op2)) {
+		if (ctx->ir_base[insn->op2].op == IR_LABEL) {
+			if (!data->resolved_label_syms) {
+				data->resolved_label_syms = 1;
+				ir_resolve_label_syms(ctx);
+			}
+
+			uint32_t target = ctx->ir_base[insn->op2].val.u32_hi;
+			target = ir_skip_empty_target_blocks(ctx, target);
+
+			|	b =>target
+			return;
+		}
+
 		void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op2]);

 		if (aarch64_may_use_b(ctx->code_buffer, addr)) {
@@ -5636,6 +5817,7 @@ static void ir_emit_param_move(ir_ctx *ctx, uint8_t type, ir_reg from_reg, ir_re
 {
 	ir_reg fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;

+	offset = IR_SPILL_POS_TO_OFFSET(offset);
 	IR_ASSERT(from_reg != IR_REG_NONE || to_reg != IR_REG_NONE);

 	if (IR_IS_TYPE_INT(type)) {
@@ -5676,13 +5858,8 @@ static void ir_emit_load_params(ir_ctx *ctx)
 	const int8_t *int_reg_params = _ir_int_reg_params;
 	const int8_t *fp_reg_params = _ir_fp_reg_params;
 	int32_t stack_offset = 0;
+	int32_t stack_start = ctx->stack_frame_size;

-	if (ctx->flags & IR_USE_FRAME_POINTER) {
-		/* skip old frame pointer and return address */
-		stack_offset = sizeof(void*) * 2 + ctx->stack_frame_size + ctx->call_stack_size;
-	} else {
-		stack_offset = ctx->stack_frame_size + ctx->call_stack_size;
-	}
 	n = use_list->count;
 	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
 		use = *p;
@@ -5706,12 +5883,9 @@ static void ir_emit_load_params(ir_ctx *ctx)
 			if (ctx->vregs[use]) {
 				dst_reg = IR_REG_NUM(ctx->regs[use][0]);
 				IR_ASSERT(src_reg != IR_REG_NONE || dst_reg != IR_REG_NONE ||
-					stack_offset == ctx->live_intervals[ctx->vregs[use]]->stack_spill_pos +
-						((ctx->flags & IR_USE_FRAME_POINTER) ?
-							-(ctx->stack_frame_size - ctx->stack_frame_alignment) :
-							ctx->call_stack_size));
+					stack_start + stack_offset == ctx->live_intervals[ctx->vregs[use]]->stack_spill_pos);
 				if (src_reg != dst_reg) {
-					ir_emit_param_move(ctx, insn->type, src_reg, dst_reg, use, stack_offset);
+					ir_emit_param_move(ctx, insn->type, src_reg, dst_reg, use, stack_start + stack_offset);
 				}
 				if (dst_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[use][0])) {
 					ir_emit_store(ctx, insn->type, use, dst_reg);
@@ -5785,14 +5959,8 @@ static void ir_fix_param_spills(ir_ctx *ctx)
 	const int8_t *int_reg_params = _ir_int_reg_params;
 	const int8_t *fp_reg_params = _ir_fp_reg_params;
 	int32_t stack_offset = 0;
-	int32_t param_stack_size = 0;
+	int32_t stack_start = ctx->stack_frame_size;

-	if (ctx->flags & IR_USE_FRAME_POINTER) {
-		/* skip old frame pointer and return address */
-		stack_offset = sizeof(void*) * 2 + (ctx->stack_frame_size - ctx->stack_frame_alignment);
-	} else {
-		stack_offset = ctx->stack_frame_size;
-	}
 	n = use_list->count;
 	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
 		use = *p;
@@ -5819,15 +5987,13 @@ static void ir_fix_param_spills(ir_ctx *ctx)
 					if ((ival->flags & IR_LIVE_INTERVAL_MEM_PARAM)
 					 && ival->stack_spill_pos == -1
 					 && (ival->next || ival->reg == IR_REG_NONE)) {
-						ival->stack_spill_pos = stack_offset;
+						ival->stack_spill_pos = stack_start + stack_offset;
 					}
 				}
 				if (sizeof(void*) == 8) {
 					stack_offset += sizeof(void*);
-					param_stack_size += sizeof(void*);
 				} else {
 					stack_offset += IR_MAX(sizeof(void*), ir_type_size[insn->type]);
-					param_stack_size += IR_MAX(sizeof(void*), ir_type_size[insn->type]);
 				}
 			}
 		}
@@ -5835,7 +6001,7 @@ static void ir_fix_param_spills(ir_ctx *ctx)

 	ctx->gp_reg_params = IR_MIN(int_param_num, int_reg_params_count);
 	ctx->fp_reg_params = IR_MIN(fp_param_num, fp_reg_params_count);
-	ctx->param_stack_size = param_stack_size;
+	ctx->param_stack_size = stack_offset;
 }

 static void ir_allocate_unique_spill_slots(ir_ctx *ctx)
@@ -5876,6 +6042,7 @@ static void ir_allocate_unique_spill_slots(ir_ctx *ctx)
 				case IR_MERGE:
 				case IR_LOOP_BEGIN:
 				case IR_LOOP_END:
+				case IR_IGOTO_DUP:
 					break;
 				default:
 					def_flags = ir_get_target_constraints(ctx, i, &constraints);
@@ -5892,7 +6059,7 @@ static void ir_allocate_unique_spill_slots(ir_ctx *ctx)
 							IR_REGSET_EXCL(available, reg);
 							ctx->regs[i][0] = reg | IR_REG_SPILL_STORE;
 						} else if (def_flags & IR_USE_MUST_BE_IN_REG) {
-							if (insn->op == IR_VLOAD
+							if ((insn->op == IR_VLOAD || insn->op == IR_VLOAD_v)
 							 && ctx->live_intervals[ctx->vregs[i]]
 							 && ctx->live_intervals[ctx->vregs[i]]->stack_spill_pos != -1
 							 && ir_is_same_mem_var(ctx, i, ctx->ir_base[insn->op2].op3)) {
@@ -5932,7 +6099,7 @@ static void ir_allocate_unique_spill_slots(ir_ctx *ctx)
 							for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
 								use = *p;
 								use_insn = &ctx->ir_base[use];
-								if (use_insn->op == IR_VLOAD) {
+								if (use_insn->op == IR_VLOAD || use_insn->op == IR_VLOAD_v) {
 									if (ctx->vregs[use]
 									 && !ctx->live_intervals[ctx->vregs[use]]) {
 										ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
@@ -5943,7 +6110,7 @@ static void ir_allocate_unique_spill_slots(ir_ctx *ctx)
 										ival->vreg = ctx->vregs[use];
 										ival->stack_spill_pos = stack_spill_pos;
 									}
-								} else if (use_insn->op == IR_VSTORE) {
+								} else if (use_insn->op == IR_VSTORE || use_insn->op == IR_VSTORE_v) {
 									if (!IR_IS_CONST_REF(use_insn->op3)
 									 && ctx->vregs[use_insn->op3]
 									 && !ctx->live_intervals[ctx->vregs[use_insn->op3]]) {
@@ -6080,25 +6247,21 @@ void ir_fix_stack_frame(ir_ctx *ctx)

 	ctx->stack_frame_size = IR_ALIGNED_SIZE(ctx->stack_frame_size, sizeof(void*));
 	ctx->stack_frame_size += additional_size;
-	ctx->stack_frame_alignment = 0;
 	ctx->call_stack_size = 0;

 	if (!(ctx->flags & IR_FUNCTION)) {
 		while (IR_ALIGNED_SIZE(ctx->stack_frame_size, 16) != ctx->stack_frame_size) {
 			ctx->stack_frame_size += sizeof(void*);
-			ctx->stack_frame_alignment += sizeof(void*);
 		}
 	} else {
 		/* Stack must be 16 byte aligned */
 		if (!(ctx->flags & IR_FUNCTION)) {
 			while (IR_ALIGNED_SIZE(ctx->stack_frame_size, 16) != ctx->stack_frame_size) {
 				ctx->stack_frame_size += sizeof(void*);
-				ctx->stack_frame_alignment += sizeof(void*);
 			}
 		} else if (ctx->flags & IR_USE_FRAME_POINTER) {
 			while (IR_ALIGNED_SIZE(ctx->stack_frame_size + sizeof(void*) * 2, 16) != ctx->stack_frame_size + sizeof(void*) * 2) {
 				ctx->stack_frame_size += sizeof(void*);
-				ctx->stack_frame_alignment += sizeof(void*);
 			}
 		} else {
 			if (!(ctx->flags & IR_NO_STACK_COMBINE)) {
@@ -6107,7 +6270,6 @@ void ir_fix_stack_frame(ir_ctx *ctx)
 			while (IR_ALIGNED_SIZE(ctx->stack_frame_size + ctx->call_stack_size, 16) !=
 					ctx->stack_frame_size + ctx->call_stack_size) {
 				ctx->stack_frame_size += sizeof(void*);
-				ctx->stack_frame_alignment += sizeof(void*);
 			}
 		}
 	}
@@ -6143,6 +6305,8 @@ void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
 	int ret;
 	void *entry;
 	size_t size;
+	ir_ref igoto_dup_ref = IR_UNUSED;
+	uint32_t igoto_dup_block = 0;

 	data.ra_data.unused_slot_4 = 0;
 	data.ra_data.unused_slot_2 = 0;
@@ -6150,11 +6314,11 @@ void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
 	data.ra_data.handled = NULL;
 	data.rodata_label = 0;
 	data.jmp_table_label = 0;
+	data.resolved_label_syms = 0;
 	ctx->data = &data;

 	if (!ctx->live_intervals) {
 		ctx->stack_frame_size = 0;
-		ctx->stack_frame_alignment = 0;
 		ctx->call_stack_size = 0;
 		ctx->used_preserved_regs = 0;
 		ir_allocate_unique_spill_slots(ctx);
@@ -6176,7 +6340,6 @@ void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
 		}
 		ctx->stack_frame_size = ctx->fixed_stack_frame_size;
 		ctx->call_stack_size = ctx->fixed_call_stack_size;
-		ctx->stack_frame_alignment = 0;
 	}

 	Dst = &data.dasm_state;
@@ -6386,6 +6549,35 @@ void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
 				case IR_TAILCALL:
 					ir_emit_tailcall(ctx, i, insn);
 					break;
+				case IR_IGOTO_DUP:
+					if (bb->flags & IR_BB_DESSA_MOVES) {
+						ir_emit_dessa_moves(ctx, b, bb);
+					}
+					IR_ASSERT(!igoto_dup_ref && !igoto_dup_block);
+					igoto_dup_ref = i;
+					igoto_dup_block = b;
+					b = ctx->cfg_edges[bb->successors];
+					bb = &ctx->cfg_blocks[b];
+					i = bb->start;
+					insn = &ctx->ir_base[i];
+					rule = &ctx->rules[i];
+					break;
+				case IR_IGOTO:
+					if ((ctx->ir_base[insn->op1].op == IR_MERGE || ctx->ir_base[insn->op1].op == IR_LOOP_BEGIN)
+					 && (ctx->rules[ctx->ir_base[insn->op1].op1] & IR_RULE_MASK) == IR_IGOTO_DUP
+					 && igoto_dup_ref) {
+						ir_emit_ijmp(ctx, i, insn);
+						b = igoto_dup_block;
+						bb = &ctx->cfg_blocks[b];
+						i = igoto_dup_ref;
+						insn = &ctx->ir_base[i];
+						rule = &ctx->rules[i];
+						igoto_dup_block = 0;
+						igoto_dup_ref = 0;
+						break;
+					}
+					IR_ASSERT(!igoto_dup_ref && !igoto_dup_block);
+					IR_FALLTHROUGH;
 				case IR_IJMP:
 					ir_emit_ijmp(ctx, i, insn);
 					break;
@@ -6396,9 +6588,11 @@ void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
 					ir_emit_vaddr(ctx, i, insn);
 					break;
 				case IR_VLOAD:
+				case IR_VLOAD_v:
 					ir_emit_vload(ctx, i, insn);
 					break;
 				case IR_VSTORE:
+				case IR_VSTORE_v:
 					ir_emit_vstore(ctx, i, insn);
 					break;
 				case IR_RLOAD:
@@ -6645,6 +6839,28 @@ void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
 		} while (i != 0);
 	}

+	if ((ctx->flags2 & IR_HAS_BLOCK_ADDR) && ctx->loader && ctx->loader->add_label) {
+		for (b = 1, bb = &ctx->cfg_blocks[b]; b <= ctx->cfg_blocks_count; bb++, b++) {
+			ir_insn *insn = &ctx->ir_base[bb->start];
+
+			if (insn->op == IR_BEGIN && insn->op2) {
+				IR_ASSERT(ctx->ir_base[insn->op2].op == IR_LABEL);
+				ctx->ir_base[insn->op2].val.u32_hi = 0;
+				ctx->loader->add_label(ctx->loader, ir_get_str(ctx, ctx->ir_base[insn->op2].val.str),
+					(char*)entry + dasm_getpclabel(&data.dasm_state, ir_skip_empty_target_blocks(ctx, b)));
+			}
+		}
+	} else if (data.resolved_label_syms) {
+		for (b = 1, bb = &ctx->cfg_blocks[b]; b <= ctx->cfg_blocks_count; bb++, b++) {
+			ir_insn *insn = &ctx->ir_base[bb->start];
+
+			if (insn->op == IR_BEGIN && insn->op2) {
+				IR_ASSERT(ctx->ir_base[insn->op2].op == IR_LABEL);
+				ctx->ir_base[insn->op2].val.u32_hi = 0;
+			}
+		}
+	}
+
 	dasm_free(&data.dasm_state);

 	if (ctx->code_buffer) {
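The new add_label loader callback closes the loop at emit time: once machine code is laid out, every BEGIN that carries a LABEL constant reports the label's name and final address (via dasm_getpclabel) to the embedder; without a loader, the temporary block numbers stashed in val.u32_hi are simply cleared. A hypothetical hook, assuming an embedder that just logs resolutions:

    #include <stdbool.h>
    #include <stdio.h>
    #include "ir.h"

    /* Hypothetical embedder callback, wired up as loader->add_label = my_add_label;
       called once per named block after ir_emit_code() fixes the addresses. */
    static bool my_add_label(ir_loader *loader, const char *name, void *addr)
    {
        (void)loader;
        fprintf(stderr, "label %s resolved to %p\n", name, addr);
        return true;
    }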
diff --git a/ext/opcache/jit/ir/ir_builder.h b/ext/opcache/jit/ir/ir_builder.h
index c1dcffdbaa0..03add759065 100644
--- a/ext/opcache/jit/ir/ir_builder.h
+++ b/ext/opcache/jit/ir/ir_builder.h
@@ -490,7 +490,7 @@ extern "C" {
 #define ir_ADD_OFFSET(_addr, _offset)     _ir_ADD_OFFSET(_ir_CTX, (_addr), (_offset))

 /* Unfoldable variant of COPY */
-#define ir_HARD_COPY(_type, _op1)         ir_emit2(_ir_CTX, IR_OPT(IR_COPY, (_type)), (_op1), 1)
+#define ir_HARD_COPY(_type, _op1)         ir_emit2(_ir_CTX, IR_OPT(IR_COPY, (_type)), (_op1), IR_COPY_HARD)
 #define ir_HARD_COPY_B(_op1)              ir_HARD_COPY(IR_BOOL, _op1)
 #define ir_HARD_COPY_U8(_op1)             ir_HARD_COPY(IR_U8, _op1)
 #define ir_HARD_COPY_U16(_op1)            ir_HARD_COPY(IR_U16, _op1)
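The magic 1 previously passed as op2 of COPY now has a name, IR_COPY_HARD (defined among the new "opt" modifiers in ir.h); it marks copies the folding engine must keep as distinct nodes, which ir_bind() relies on when one definition needs two different bindings. A usage sketch, assuming the usual ir.h context:

    #include "ir.h"

    /* v2 stays its own SSA node; a plain COPY could be folded back into v1. */
    static ir_ref make_pinned_copy(ir_ctx *ctx, ir_ref v1)
    {
        return ir_emit2(ctx, IR_OPT(IR_COPY, IR_I32), v1, IR_COPY_HARD);
    }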
@@ -544,6 +544,8 @@ extern "C" {
 #define ir_VLOAD_D(_var)                  _ir_VLOAD(_ir_CTX, IR_DOUBLE, (_var))
 #define ir_VLOAD_F(_var)                  _ir_VLOAD(_ir_CTX, IR_FLOAT, (_var))
 #define ir_VSTORE(_var, _val)             _ir_VSTORE(_ir_CTX, (_var), (_val))
+#define ir_VLOAD_v(_type, _var)           _ir_VLOAD_v(_ir_CTX, (_type), (_var))
+#define ir_VSTORE_v(_var, _val)           _ir_VSTORE_v(_ir_CTX, (_var), (_val))
 #define ir_RLOAD(_type, _reg)             _ir_RLOAD(_ir_CTX, (_type), (_reg))
 #define ir_RLOAD_B(_reg)                  _ir_RLOAD(_ir_CTX, IR_BOOL, (_reg))
 #define ir_RLOAD_U8(_reg)                 _ir_RLOAD(_ir_CTX, IR_U8, (_reg))
@@ -574,6 +576,8 @@ extern "C" {
 #define ir_LOAD_D(_addr)                  _ir_LOAD(_ir_CTX, IR_DOUBLE, (_addr))
 #define ir_LOAD_F(_addr)                  _ir_LOAD(_ir_CTX, IR_FLOAT, (_addr))
 #define ir_STORE(_addr, _val)             _ir_STORE(_ir_CTX, (_addr), (_val))
+#define ir_LOAD_v(_type, _addr)           _ir_LOAD_v(_ir_CTX, (_type), (_addr))
+#define ir_STORE_v(_addr, _val)           _ir_STORE_v(_ir_CTX, (_addr), (_val))
 #define ir_TLS(_index, _offset)           _ir_TLS(_ir_CTX, (_index), (_offset))
 #define ir_TRAP()                         do {_ir_CTX->control = ir_emit1(_ir_CTX, IR_TRAP, _ir_CTX->control);} while (0)

@@ -586,7 +590,7 @@ extern "C" {
 #define ir_VA_END(_list)                  _ir_VA_END(_ir_CTX, _list)
 #define ir_VA_COPY(_dst, _src)            _ir_VA_COPY(_ir_CTX, _dst, _src)
 #define ir_VA_ARG(_list, _type)           _ir_VA_ARG(_ir_CTX, _type, _list)
-#define ir_VA_ARG_EX(_list, _type, size)  _ir_VA_ARG_EX(_ir_CTX, _type, _list, size)
+#define ir_VA_ARG_EX(_list, _type, s, a)  _ir_VA_ARG_EX(_ir_CTX, _type, _list, s, a)

 #define ir_START()                        _ir_START(_ir_CTX)
 #define ir_ENTRY(_src, _num)              _ir_ENTRY(_ir_CTX, (_src), (_num))
@@ -607,6 +611,7 @@ extern "C" {
 #define ir_CASE_RANGE(_switch, _v1, _v2)  _ir_CASE_RANGE(_ir_CTX, (_switch), (_v1), (_v2))
 #define ir_CASE_DEFAULT(_switch)          _ir_CASE_DEFAULT(_ir_CTX, (_switch))
 #define ir_RETURN(_val)                   _ir_RETURN(_ir_CTX, (_val))
+#define ir_IGOTO(_addr)                   _ir_IGOTO(_ir_CTX, (_addr))
 #define ir_IJMP(_addr)                    _ir_IJMP(_ir_CTX, (_addr))
 #define ir_UNREACHABLE()                  _ir_UNREACHABLE(_ir_CTX)

@@ -654,15 +659,19 @@ ir_ref _ir_ALLOCA(ir_ctx *ctx, ir_ref size);
 void   _ir_AFREE(ir_ctx *ctx, ir_ref size);
 ir_ref _ir_VLOAD(ir_ctx *ctx, ir_type type, ir_ref var);
 void   _ir_VSTORE(ir_ctx *ctx, ir_ref var, ir_ref val);
+ir_ref _ir_VLOAD_v(ir_ctx *ctx, ir_type type, ir_ref var);
+void   _ir_VSTORE_v(ir_ctx *ctx, ir_ref var, ir_ref val);
 ir_ref _ir_RLOAD(ir_ctx *ctx, ir_type type, ir_ref reg);
 void   _ir_RSTORE(ir_ctx *ctx, ir_ref reg, ir_ref val);
 ir_ref _ir_LOAD(ir_ctx *ctx, ir_type type, ir_ref addr);
 void   _ir_STORE(ir_ctx *ctx, ir_ref addr, ir_ref val);
+ir_ref _ir_LOAD_v(ir_ctx *ctx, ir_type type, ir_ref addr);
+void   _ir_STORE_v(ir_ctx *ctx, ir_ref addr, ir_ref val);
 void   _ir_VA_START(ir_ctx *ctx, ir_ref list);
 void   _ir_VA_END(ir_ctx *ctx, ir_ref list);
 void   _ir_VA_COPY(ir_ctx *ctx, ir_ref dst, ir_ref src);
 ir_ref _ir_VA_ARG(ir_ctx *ctx, ir_type type, ir_ref list);
-ir_ref _ir_VA_ARG_EX(ir_ctx *ctx, ir_type type, ir_ref list, size_t size);
+ir_ref _ir_VA_ARG_EX(ir_ctx *ctx, ir_type type, ir_ref list, size_t size, size_t align);
 void   _ir_START(ir_ctx *ctx);
 void   _ir_ENTRY(ir_ctx *ctx, ir_ref src, ir_ref num);
 void   _ir_BEGIN(ir_ctx *ctx, ir_ref src);
@@ -688,6 +697,7 @@ void   _ir_CASE_VAL(ir_ctx *ctx, ir_ref switch_ref, ir_ref val);
 void   _ir_CASE_RANGE(ir_ctx *ctx, ir_ref switch_ref, ir_ref v1, ir_ref v2);
 void   _ir_CASE_DEFAULT(ir_ctx *ctx, ir_ref switch_ref);
 void   _ir_RETURN(ir_ctx *ctx, ir_ref val);
+ir_ref _ir_IGOTO(ir_ctx *ctx, ir_ref addr);
 void   _ir_IJMP(ir_ctx *ctx, ir_ref addr);
 void   _ir_GUARD(ir_ctx *ctx, ir_ref condition, ir_ref addr);
 void   _ir_GUARD_NOT(ir_ctx *ctx, ir_ref condition, ir_ref addr);
diff --git a/ext/opcache/jit/ir/ir_cfg.c b/ext/opcache/jit/ir/ir_cfg.c
index 00923387bb2..46755067b24 100644
--- a/ext/opcache/jit/ir/ir_cfg.c
+++ b/ext/opcache/jit/ir/ir_cfg.c
@@ -820,11 +820,14 @@ int ir_build_dominators_tree(ir_ctx *ctx)
 			succ_b = ctx->cfg_edges[bb->successors];
 			if (bb->successors_count != 1) {
 				/* LOOP_END/END may be linked with the following ENTRY by a fake edge */
-				IR_ASSERT(bb->successors_count == 2);
-				if (blocks[succ_b].flags & IR_BB_ENTRY) {
+				if (bb->successors_count != 2) {
+					complete = 0;
+					break;
+				} else if (blocks[succ_b].flags & IR_BB_ENTRY) {
 					succ_b = ctx->cfg_edges[bb->successors + 1];
-				} else {
-					IR_ASSERT(blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
+				} else if (!(blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY)) {
+					complete = 0;
+					break;
 				}
 			}
 			dom_depth = blocks[succ_b].dom_depth;;
diff --git a/ext/opcache/jit/ir/ir_check.c b/ext/opcache/jit/ir/ir_check.c
index c25a984aefc..ee951291b1b 100644
--- a/ext/opcache/jit/ir/ir_check.c
+++ b/ext/opcache/jit/ir/ir_check.c
@@ -328,7 +328,9 @@ bool ir_check(const ir_ctx *ctx)
 				}
 				break;
 			case IR_LOAD:
+			case IR_LOAD_v:
 			case IR_STORE:
+			case IR_STORE_v:
 				type = ctx->ir_base[insn->op2].type;
 				if (type != IR_ADDR
 				 && (!IR_IS_TYPE_INT(type) || ir_type_size[type] != ir_type_size[IR_ADDR])) {
@@ -338,7 +340,9 @@ bool ir_check(const ir_ctx *ctx)
 				}
 				break;
 			case IR_VLOAD:
+			case IR_VLOAD_v:
 			case IR_VSTORE:
+			case IR_VSTORE_v:
 				if (ctx->ir_base[insn->op2].op != IR_VAR) {
 					fprintf(stderr, "ir_base[%d].op2 must be 'VAR' (%s)\n",
 						i, ir_op_name[ctx->ir_base[insn->op2].op]);
@@ -408,6 +412,8 @@ bool ir_check(const ir_ctx *ctx)
 							ok = 0;
 						}
 						break;
+					case IR_IGOTO:
+						break;
 					default:
 						/* skip data references */
 						count = n = use_list->count;
diff --git a/ext/opcache/jit/ir/ir_dump.c b/ext/opcache/jit/ir/ir_dump.c
index a501d261f30..5cc732927d4 100644
--- a/ext/opcache/jit/ir/ir_dump.c
+++ b/ext/opcache/jit/ir/ir_dump.c
@@ -129,6 +129,11 @@ void ir_dump_dot(const ir_ctx *ctx, const char *name, FILE *f)
 					case IR_OPND_CONTROL_REF:
 						fprintf(f, "\tn%d -> n%d [style=dashed,dir=back,weight=%d];\n", ref, i, REF_WEIGHT);
 						break;
+					case IR_OPND_LABEL_REF:
+						if (ref) {
+							fprintf(f, "\tc%d -> n%d [color=blue,weight=%d];\n", -ref, i, REF_WEIGHT);
+						}
+						break;
 				}
 			}
 		}
@@ -491,6 +496,8 @@ void ir_dump_codegen(const ir_ctx *ctx, FILE *f)
 			ir_print_proto(ctx, insn->proto, f);
 		} else if (insn->op == IR_SYM) {
 			fprintf(f, "sym(%s)", ir_get_str(ctx, insn->val.name));
+		} else if (insn->op == IR_LABEL) {
+			fprintf(f, "label(%s)", ir_get_str(ctx, insn->val.name));
 		} else if (insn->op == IR_FUNC_ADDR) {
 			fprintf(f, "func *");
 			ir_print_const(ctx, insn, f, true);
@@ -648,6 +655,12 @@ void ir_dump_codegen(const ir_ctx *ctx, FILE *f)
 							fprintf(f, "%s%d", first ? "(" : ", ", ref);
 							first = 0;
 							break;
+						case IR_OPND_LABEL_REF:
+							if (ref) {
+								IR_ASSERT(IR_IS_CONST_REF(ref));
+								fprintf(f, "%sc_%d", first ? "(" : ", ", -ref);
+							}
+							break;
 					}
 				} else if (opnd_kind == IR_OPND_NUM) {
 					fprintf(f, "%s%d", first ? "(" : ", ", ref);
diff --git a/ext/opcache/jit/ir/ir_emit.c b/ext/opcache/jit/ir/ir_emit.c
index 7a10da1322a..847ca375b5b 100644
--- a/ext/opcache/jit/ir/ir_emit.c
+++ b/ext/opcache/jit/ir/ir_emit.c
@@ -244,32 +244,30 @@ static int ir_get_args_regs(const ir_ctx *ctx, const ir_insn *insn, int8_t *regs
 		ir_insn *arg = &ctx->ir_base[ir_insn_op(insn, j)];
 		type = arg->type;
 		if (IR_IS_TYPE_INT(type)) {
-			if (arg->op == IR_ARGVAL) {
-				continue;
-			} else if (int_param < int_reg_params_count) {
+			if (int_param < int_reg_params_count && arg->op != IR_ARGVAL) {
 				regs[j] = int_reg_params[int_param];
 				count = j + 1;
+				int_param++;
+#ifdef _WIN64
+				/* WIN64 calling convention uses a common counter for int and fp registers */
+				fp_param++;
+#endif
 			} else {
 				regs[j] = IR_REG_NONE;
 			}
-			int_param++;
-#ifdef _WIN64
-			/* WIN64 calling convention use common couter for int and fp registers */
-			fp_param++;
-#endif
 		} else {
 			IR_ASSERT(IR_IS_TYPE_FP(type));
 			if (fp_param < fp_reg_params_count) {
 				regs[j] = fp_reg_params[fp_param];
 				count = j + 1;
+				fp_param++;
+#ifdef _WIN64
+				/* WIN64 calling convention uses a common counter for int and fp registers */
+				int_param++;
+#endif
 			} else {
 				regs[j] = IR_REG_NONE;
 			}
-			fp_param++;
-#ifdef _WIN64
-			/* WIN64 calling convention use common couter for int and fp registers */
-			int_param++;
-#endif
 		}
 	}
 	return count;
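The restructured loop makes the counter discipline explicit: an argument consumes a register slot only when it actually gets one, and on WIN64 every consumed slot bumps both counters, because the Microsoft x64 convention assigns argument registers purely by position rather than keeping separate GP and FP counters. Illustrated:

    /* void f(int a, double b, int c, double d);
     *
     * Win64 (one shared positional counter):   System V (independent counters):
     *   a -> RCX  (slot 0)                       a -> EDI
     *   b -> XMM1 (slot 1)                       b -> XMM0
     *   c -> R8   (slot 2)                       c -> ESI
     *   d -> XMM3 (slot 3)                       d -> XMM1
     */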
@@ -426,7 +424,7 @@ typedef struct _ir_common_backend_data {
 	ir_bitset          emit_constants;
 } ir_common_backend_data;

-static int ir_const_label(ir_ctx *ctx, ir_ref ref)
+static int ir_get_const_label(ir_ctx *ctx, ir_ref ref)
 {
 	ir_common_backend_data *data = ctx->data;
 	int label = ctx->cfg_blocks_count - ref;
@@ -1015,11 +1013,16 @@ int ir_match(ir_ctx *ctx)
 			entries_count++;
 		}
 		ctx->rules[start] = IR_SKIPPED | IR_NOP;
+		if (ctx->ir_base[start].op == IR_BEGIN && ctx->ir_base[start].op2) {
+			ctx->flags2 |= IR_HAS_BLOCK_ADDR;
+		}
 		ref = bb->end;
 		if (bb->successors_count == 1) {
 			insn = &ctx->ir_base[ref];
 			if (insn->op == IR_END || insn->op == IR_LOOP_END) {
-				ctx->rules[ref] = insn->op;
+				if (!ctx->rules[ref]) {
+					ctx->rules[ref] = insn->op;
+				}
 				ref = prev_ref[ref];
 				if (ref == start && ctx->cfg_edges[bb->successors] != b) {
 					if (EXPECTED(!(bb->flags & IR_BB_ENTRY))) {
diff --git a/ext/opcache/jit/ir/ir_fold.h b/ext/opcache/jit/ir/ir_fold.h
index 74f7818d747..bab6b291607 100644
--- a/ext/opcache/jit/ir/ir_fold.h
+++ b/ext/opcache/jit/ir/ir_fold.h
@@ -755,8 +755,35 @@ IR_FOLD(NEG(C_FLOAT))
 }

 IR_FOLD(ABS(C_I8))
+{
+	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
+	if (op1_insn->val.i64 >= 0) {
+		IR_FOLD_COPY(op1);
+	} else {
+		IR_FOLD_CONST_I(-op1_insn->val.i8);
+	}
+}
+
 IR_FOLD(ABS(C_I16))
+{
+	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
+	if (op1_insn->val.i64 >= 0) {
+		IR_FOLD_COPY(op1);
+	} else {
+		IR_FOLD_CONST_I(-op1_insn->val.i16);
+	}
+}
+
 IR_FOLD(ABS(C_I32))
+{
+	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
+	if (op1_insn->val.i64 >= 0) {
+		IR_FOLD_COPY(op1);
+	} else {
+		IR_FOLD_CONST_I((int32_t)-op1_insn->val.u32);
+	}
+}
+
 IR_FOLD(ABS(C_I64))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
@@ -847,7 +874,7 @@ IR_FOLD(MUL_OV(C_U64, C_U64))
 	uint64_t res;
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
 	res = op1_insn->val.u64 * op2_insn->val.u64;
-	if (op1_insn->val.u64 != 0 && res / op1_insn->val.u64 != op2_insn->val.u64 && res <= max) {
+	if ((op1_insn->val.u64 != 0 && res / op1_insn->val.u64 != op2_insn->val.u64) || res > max) {
 		IR_FOLD_NEXT;
 	}
 	IR_FOLD_CONST_U(res);
@@ -864,7 +891,7 @@ IR_FOLD(MUL_OV(C_I64, C_I64))
 	int64_t res;
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
 	res = op1_insn->val.u64 * op2_insn->val.u64;
-	if (op1_insn->val.i64 != 0 && res / op1_insn->val.i64 != op2_insn->val.i64 && res >= min && res <= max) {
+	if ((op1_insn->val.i64 != 0 && res / op1_insn->val.i64 != op2_insn->val.i64) || res < min || res > max) {
 		IR_FOLD_NEXT;
 	}
 	IR_FOLD_CONST_U(res);
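The old MUL_OV fold guards were effectively inverted: with && joining the tests, the fold refused only when the division check detected wraparound *and* the truncated result still looked in-range, so other overflowing products were folded to a wrong constant. The corrected condition skips folding when either the multiply wrapped or the result escapes the type's range. The division trick in isolation:

    #include <stdbool.h>
    #include <stdint.h>

    /* Detect unsigned multiply overflow without a wider type: if a*b wrapped
       modulo 2^64, dividing the wrapped product by a cannot give back b. */
    static bool mul_wraps_u64(uint64_t a, uint64_t b, uint64_t *res)
    {
        *res = a * b;
        return a != 0 && *res / a != b;
    }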
@@ -1037,220 +1064,220 @@ IR_FOLD(SHL(C_U8, C_U8))
 IR_FOLD(SHL(C_CHAR, C_CHAR))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(op1_insn->val.u8 << op2_insn->val.u8);
+	IR_FOLD_CONST_U(op1_insn->val.u8 << (op2_insn->val.u8 & 0x7));
 }

 IR_FOLD(SHL(C_I8, C_I8))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I((int8_t)(op1_insn->val.u8 << op2_insn->val.u8));
+	IR_FOLD_CONST_I((int8_t)(op1_insn->val.u8 << (op2_insn->val.u8 & 0x7)));
 }

 IR_FOLD(SHL(C_U16, C_U16))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(op1_insn->val.u16 << op2_insn->val.u16);
+	IR_FOLD_CONST_U(op1_insn->val.u16 << (op2_insn->val.u16 & 0xf));
 }

 IR_FOLD(SHL(C_I16, C_I16))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I((int16_t)(op1_insn->val.u16 << op2_insn->val.u16));
+	IR_FOLD_CONST_I((int16_t)(op1_insn->val.u16 << (op2_insn->val.u16 & 0xf)));
 }

 IR_FOLD(SHL(C_U32, C_U32))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(op1_insn->val.u32 << op2_insn->val.u32);
+	IR_FOLD_CONST_U(op1_insn->val.u32 << (op2_insn->val.u32 & 0x1f));
 }

 IR_FOLD(SHL(C_I32, C_I32))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I((int32_t)(op1_insn->val.u32 << op2_insn->val.u32));
+	IR_FOLD_CONST_I((int32_t)(op1_insn->val.u32 << (op2_insn->val.u32 & 0x1f)));
 }

 IR_FOLD(SHL(C_U64, C_U64))
 IR_FOLD(SHL(C_I64, C_I64))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(op1_insn->val.u64 << op2_insn->val.u64);
+	IR_FOLD_CONST_U(op1_insn->val.u64 << (op2_insn->val.u64 & 0x3f));
 }

 IR_FOLD(SHR(C_U8, C_U8))
 IR_FOLD(SHR(C_CHAR, C_CHAR))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(op1_insn->val.u8 >> op2_insn->val.u8);
+	IR_FOLD_CONST_U(op1_insn->val.u8 >> (op2_insn->val.u8 & 0x7));
 }

 IR_FOLD(SHR(C_I8, C_I8))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I((int8_t)(op1_insn->val.u8 >> op2_insn->val.u8));
+	IR_FOLD_CONST_I((int8_t)(op1_insn->val.u8 >> (op2_insn->val.u8 & 0x7)));
 }

 IR_FOLD(SHR(C_U16, C_U16))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(op1_insn->val.u16 >> op2_insn->val.u16);
+	IR_FOLD_CONST_U(op1_insn->val.u16 >> (op2_insn->val.u16 & 0xf));
 }

 IR_FOLD(SHR(C_I16, C_I16))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I((int16_t)(op1_insn->val.u16 >> op2_insn->val.u16));
+	IR_FOLD_CONST_I((int16_t)(op1_insn->val.u16 >> (op2_insn->val.u16 & 0xf)));
 }

 IR_FOLD(SHR(C_U32, C_U32))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(op1_insn->val.u32 >> op2_insn->val.u32);
+	IR_FOLD_CONST_U(op1_insn->val.u32 >> (op2_insn->val.u32 & 0x1f));
 }

 IR_FOLD(SHR(C_I32, C_I32))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I((int32_t)(op1_insn->val.u32 >> op2_insn->val.u32));
+	IR_FOLD_CONST_I((int32_t)(op1_insn->val.u32 >> (op2_insn->val.u32 & 0x1f)));
 }

 IR_FOLD(SHR(C_U64, C_U64))
 IR_FOLD(SHR(C_I64, C_I64))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(op1_insn->val.u64 >> op2_insn->val.u64);
+	IR_FOLD_CONST_U(op1_insn->val.u64 >> (op2_insn->val.u64 & 0x3f));
 }

 IR_FOLD(SAR(C_U8, C_U8))
 IR_FOLD(SAR(C_CHAR, C_CHAR))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U((uint8_t)(op1_insn->val.i8 >> op2_insn->val.i8));
+	IR_FOLD_CONST_U((uint8_t)(op1_insn->val.i8 >> (op2_insn->val.i8 & 0x7)));
 }

 IR_FOLD(SAR(C_I8, C_I8))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I(op1_insn->val.i8 >> op2_insn->val.i8);
+	IR_FOLD_CONST_I(op1_insn->val.i8 >> (op2_insn->val.i8 & 0x7));
 }

 IR_FOLD(SAR(C_U16, C_U16))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U((uint16_t)(op1_insn->val.i16 >> op2_insn->val.i16));
+	IR_FOLD_CONST_U((uint16_t)(op1_insn->val.i16 >> (op2_insn->val.i16 & 0xf)));
 }

 IR_FOLD(SAR(C_I16, C_I16))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I(op1_insn->val.i16 >> op2_insn->val.i16);
+	IR_FOLD_CONST_I(op1_insn->val.i16 >> (op2_insn->val.i16 & 0xf));
 }

 IR_FOLD(SAR(C_U32, C_U32))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U((uint32_t)(op1_insn->val.i32 >> op2_insn->val.i32));
+	IR_FOLD_CONST_U((uint32_t)(op1_insn->val.i32 >> (op2_insn->val.i32 & 0x1f)));
 }

 IR_FOLD(SAR(C_I32, C_I32))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I(op1_insn->val.i32 >> op2_insn->val.i32);
+	IR_FOLD_CONST_I(op1_insn->val.i32 >> (op2_insn->val.i32 & 0x1f));
 }

 IR_FOLD(SAR(C_U64, C_U64))
 IR_FOLD(SAR(C_I64, C_I64))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I(op1_insn->val.i64 >> op2_insn->val.i64);
+	IR_FOLD_CONST_I(op1_insn->val.i64 >> (op2_insn->val.i64 & 0x3f));
 }

 IR_FOLD(ROL(C_U8, C_U8))
 IR_FOLD(ROL(C_CHAR, C_CHAR))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(ir_rol8(op1_insn->val.u8, op2_insn->val.u8));
+	IR_FOLD_CONST_U(ir_rol8(op1_insn->val.u8, (op2_insn->val.u8 & 0x7)));
 }

 IR_FOLD(ROL(C_I8, C_I8))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I((int8_t)ir_rol8(op1_insn->val.u8, op2_insn->val.u8));
+	IR_FOLD_CONST_I((int8_t)ir_rol8(op1_insn->val.u8, (op2_insn->val.u8 & 0x7)));
 }

 IR_FOLD(ROL(C_U16, C_U16))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(ir_rol16(op1_insn->val.u16, op2_insn->val.u16));
+	IR_FOLD_CONST_U(ir_rol16(op1_insn->val.u16, (op2_insn->val.u16 & 0xf)));
 }

 IR_FOLD(ROL(C_I16, C_I16))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I((int16_t)ir_rol16(op1_insn->val.u16, op2_insn->val.u16));
+	IR_FOLD_CONST_I((int16_t)ir_rol16(op1_insn->val.u16, (op2_insn->val.u16 & 0xf)));
 }

 IR_FOLD(ROL(C_U32, C_U32))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(ir_rol32(op1_insn->val.u32, op2_insn->val.u32));
+	IR_FOLD_CONST_U(ir_rol32(op1_insn->val.u32, (op2_insn->val.u32 & 0x1f)));
 }

 IR_FOLD(ROL(C_I32, C_I32))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I((int32_t)ir_rol32(op1_insn->val.u32, op2_insn->val.u32));
+	IR_FOLD_CONST_I((int32_t)ir_rol32(op1_insn->val.u32, (op2_insn->val.u32 & 0x1f)));
 }

 IR_FOLD(ROL(C_U64, C_U64))
 IR_FOLD(ROL(C_I64, C_I64))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(ir_rol64(op1_insn->val.u64, op2_insn->val.u64));
+	IR_FOLD_CONST_U(ir_rol64(op1_insn->val.u64, (op2_insn->val.u64 & 0x3f)));
 }

 IR_FOLD(ROR(C_U8, C_U8))
 IR_FOLD(ROR(C_CHAR, C_CHAR))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(ir_ror8(op1_insn->val.u8, op2_insn->val.u8));
+	IR_FOLD_CONST_U(ir_ror8(op1_insn->val.u8, (op2_insn->val.u8 & 0x7)));
 }

 IR_FOLD(ROR(C_I8, C_I8))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I((int8_t)ir_ror8(op1_insn->val.u8, op2_insn->val.u8));
+	IR_FOLD_CONST_I((int8_t)ir_ror8(op1_insn->val.u8, (op2_insn->val.u8 & 0x7)));
 }

 IR_FOLD(ROR(C_U16, C_U16))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(ir_ror16(op1_insn->val.u16, op2_insn->val.u16));
+	IR_FOLD_CONST_U(ir_ror16(op1_insn->val.u16, (op2_insn->val.u16 & 0xf)));
 }

 IR_FOLD(ROR(C_I16, C_I16))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I((int16_t)ir_ror16(op1_insn->val.u16, op2_insn->val.u16));
+	IR_FOLD_CONST_I((int16_t)ir_ror16(op1_insn->val.u16, (op2_insn->val.u16 & 0xf)));
 }

 IR_FOLD(ROR(C_U32, C_U32))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(ir_ror32(op1_insn->val.u32, op2_insn->val.u32));
+	IR_FOLD_CONST_U(ir_ror32(op1_insn->val.u32, (op2_insn->val.u32 & 0x1f)));
 }

 IR_FOLD(ROR(C_I32, C_I32))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_I((int32_t)ir_ror32(op1_insn->val.u32, op2_insn->val.u32));
+	IR_FOLD_CONST_I((int32_t)ir_ror32(op1_insn->val.u32, (op2_insn->val.u32 & 0x1f)));
 }

 IR_FOLD(ROR(C_U64, C_U64))
 IR_FOLD(ROR(C_I64, C_I64))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	IR_FOLD_CONST_U(ir_ror64(op1_insn->val.u64, op2_insn->val.u64));
+	IR_FOLD_CONST_U(ir_ror64(op1_insn->val.u64, (op2_insn->val.u64 & 0x3f)));
 }

 //IR_FOLD(BSWAP(CONST))
@@ -1392,6 +1419,9 @@ IR_FOLD(TRUNC(C_U64))
 			IR_FOLD_CONST_U(op1_insn->val.u16);
 		case IR_U32:
 			IR_FOLD_CONST_U(op1_insn->val.u32);
+		case IR_ADDR:
+			IR_ASSERT(sizeof(void*) == 4);
+			IR_FOLD_CONST_U(op1_insn->val.u32);
 	}
 }

@@ -1545,7 +1575,7 @@ IR_FOLD(FP2FP(C_DOUBLE))
 IR_FOLD(COPY(_))
 {
 	IR_ASSERT(IR_OPT_TYPE(opt) == op1_insn->type);
-	if (!op2) {
+	if (!(op2 & IR_COPY_HARD)) {
 		IR_FOLD_COPY(op1);
 	}
 	/* skip CSE */
@@ -2075,23 +2105,23 @@ IR_FOLD(SUB(ADD, ADD))
 			IR_FOLD_CONST_U(0);
 		} else if (op1_insn->op1 == op2_insn->op1) {
 			/* (a + b) - (a + c) => b - c */
-			op1 = op1_insn->op2;
-			op2 = op2_insn->op2;
+			op1 = _ir_fold_cast(ctx, op1_insn->op2, IR_OPT_TYPE(opt));
+			op2 = _ir_fold_cast(ctx, op2_insn->op2, IR_OPT_TYPE(opt));
 			IR_FOLD_RESTART;
 		} else if (op1_insn->op1 == op2_insn->op2) {
 			/* (a + b) - (c + a) => b - c */
-			op1 = op1_insn->op2;
-			op2 = op2_insn->op1;
+			op1 = _ir_fold_cast(ctx, op1_insn->op2, IR_OPT_TYPE(opt));
+			op2 = _ir_fold_cast(ctx, op2_insn->op1, IR_OPT_TYPE(opt));
 			IR_FOLD_RESTART;
 		} else if (op1_insn->op2 == op2_insn->op1) {
 			/* (a + b) - (b + c) => a - c */
-			op1 = op1_insn->op1;
-			op2 = op2_insn->op2;
+			op1 = _ir_fold_cast(ctx, op1_insn->op1, IR_OPT_TYPE(opt));
+			op2 = _ir_fold_cast(ctx, op2_insn->op2, IR_OPT_TYPE(opt));
 			IR_FOLD_RESTART;
 		} else if (op1_insn->op2 == op2_insn->op2) {
 			/* (a + b) - (c + b) => a - c */
-			op1 = op1_insn->op1;
-			op2 = op2_insn->op1;
+			op1 = _ir_fold_cast(ctx, op1_insn->op1, IR_OPT_TYPE(opt));
+			op2 = _ir_fold_cast(ctx, op2_insn->op1, IR_OPT_TYPE(opt));
 			IR_FOLD_RESTART;
 		}
 	}
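
All of the SHL/SHR/SAR/ROL/ROR folds above now mask the constant shift count to the operand width minus one. Without the mask, folding a count greater than or equal to the bit width is undefined behavior in the host C compiler, while the code the JIT would otherwise have emitted masks the count in hardware (x86 shifts use only the low bits of CL), so compile-time and run-time results could diverge. The 32-bit case in isolation:

	uint32_t fold_shl32(uint32_t x, uint32_t n)
	{
		/* n & 0x1f mirrors the hardware masking and keeps the
		   host-side fold free of undefined behavior */
		return x << (n & 0x1f);
	}

Separately, the SUB(ADD, ADD) reassociation now runs each surviving operand through _ir_fold_cast(), so an operand whose type differs from the result type (typically ADDR mixed with an integer type) is re-folded or bitcast to the result type before the rule restarts.
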
diff --git a/ext/opcache/jit/ir/ir_gcm.c b/ext/opcache/jit/ir/ir_gcm.c
index 043e1e7bdd8..e6486ba64a1 100644
--- a/ext/opcache/jit/ir/ir_gcm.c
+++ b/ext/opcache/jit/ir/ir_gcm.c
@@ -361,20 +361,20 @@ static bool ir_split_partially_dead_node(ir_ctx *ctx, ir_ref ref, uint32_t b)
 				while (ir_sparse_set_in(&data->totally_useful, ctx->cfg_blocks[j].idom)) {
 					j = ctx->cfg_blocks[j].idom;
 				}
+				clone = ir_hashtab_find(&hash, j);
+				if (clone == IR_INVALID_VAL) {
+					clone = clones_count++;
+					ir_hashtab_add(&hash, j, clone);
+					clones[clone].block = j;
+					clones[clone].use_count = 0;
+					clones[clone].use = -1;
+				}
+				uses[uses_count].ref = use;
+				uses[uses_count].block = i;
+				uses[uses_count].next = clones[clone].use;
+				clones[clone].use_count++;
+				clones[clone].use = uses_count++;
 			}
-			clone = ir_hashtab_find(&hash, j);
-			if (clone == IR_INVALID_VAL) {
-				clone = clones_count++;
-				ir_hashtab_add(&hash, j, clone);
-				clones[clone].block = j;
-				clones[clone].use_count = 0;
-				clones[clone].use = -1;
-			}
-			uses[uses_count].ref = use;
-			uses[uses_count].block = i;
-			uses[uses_count].next = clones[clone].use;
-			clones[clone].use_count++;
-			clones[clone].use = uses_count++;
 		}
 	}

@@ -1007,7 +1007,11 @@ int ir_schedule(ir_ctx *ctx)
 		start = i = bb->start;
 		_xlat[i] = bb->start = insns_count;
 		insn = &ctx->ir_base[i];
-		if (insn->op == IR_CASE_VAL) {
+		if (insn->op == IR_BEGIN) {
+			if (insn->op2) {
+				consts_count += ir_count_constant(_xlat, insn->op2);
+			}
+		} else if (insn->op == IR_CASE_VAL) {
 			IR_ASSERT(insn->op2 < IR_TRUE);
 			consts_count += ir_count_constant(_xlat, insn->op2);
 		} else if (insn->op == IR_CASE_RANGE) {
@@ -1255,7 +1259,7 @@ int ir_schedule(ir_ctx *ctx)
 						const char *proto = ir_get_strl(ctx, new_insn->proto, &len);
 						new_insn->proto = ir_strl(&new_ctx, proto, len);
 					}
-				} else if (new_insn->op == IR_SYM || new_insn->op == IR_STR) {
+				} else if (new_insn->op == IR_SYM || new_insn->op == IR_STR || new_insn->op == IR_LABEL) {
 					size_t len;
 					const char *str = ir_get_strl(ctx, new_insn->val.name, &len);
 					new_insn->val.u64 = ir_strl(&new_ctx, str, len);
@@ -1292,7 +1296,7 @@ int ir_schedule(ir_ctx *ctx)
 				} else {
 					new_insn->proto = 0;
 				}
-			} else if (insn->op == IR_SYM || insn->op == IR_STR) {
+			} else if (insn->op == IR_SYM || insn->op == IR_STR || insn->op == IR_LABEL) {
 				size_t len;
 				const char *str = ir_get_strl(ctx, insn->val.name, &len);
 				new_insn->val.u64 = ir_strl(&new_ctx, str, len);
@@ -1364,6 +1368,8 @@ int ir_schedule(ir_ctx *ctx)
 					size_t len;
 					const char *str = ir_get_strl(ctx, insn->op2, &len);
 					new_insn->op2 = ir_strl(&new_ctx, str, len);
+				} else if (new_insn->op == IR_BEGIN && insn->op2) {
+					new_insn->op2 = _xlat[insn->op2];
 				} else {
 					new_insn->op2 = insn->op2;
 				}
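
The ir_split_partially_dead_node() hunk above is a scoping fix: the clone find-or-insert and the per-use bookkeeping move inside the branch that computes the target block j, where previously they ran for every use and could pick up a j left over from an earlier iteration. The hazard, reduced to a sketch (names illustrative):

	/* old shape: record_use() ran unconditionally, with whatever j held */
	if (use_needs_hoisting) {
		j = find_hoist_block(...);     /* j is only meaningful here */
		record_use(j, use);            /* new shape: always a fresh j */
	}

The ir_schedule() hunks extend the existing CASE_VAL/CASE_RANGE handling to BEGIN nodes whose op2 references a label constant, so the constant is counted, translated through _xlat, and its name re-interned into the new context like any other constant.
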
diff --git a/ext/opcache/jit/ir/ir_private.h b/ext/opcache/jit/ir/ir_private.h
index 2f457cbc993..dbacc3967d0 100644
--- a/ext/opcache/jit/ir/ir_private.h
+++ b/ext/opcache/jit/ir/ir_private.h
@@ -887,7 +887,7 @@ void ir_print_escaped_str(const char *s, size_t len, FILE *f);

 #define IR_IS_CONST_OP(op)       ((op) > IR_NOP && (op) <= IR_C_FLOAT)
 #define IR_IS_FOLDABLE_OP(op)    ((op) <= IR_LAST_FOLDABLE_OP)
-#define IR_IS_SYM_CONST(op)      ((op) == IR_STR || (op) == IR_SYM || (op) == IR_FUNC)
+#define IR_IS_SYM_CONST(op)      ((op) == IR_STR || (op) == IR_SYM || (op) == IR_FUNC || (op) == IR_LABEL)

 ir_ref ir_const_ex(ir_ctx *ctx, ir_val val, uint8_t type, uint32_t optx);

@@ -946,12 +946,13 @@ IR_ALWAYS_INLINE bool ir_ref_is_true(ir_ctx *ctx, ir_ref ref)
 #define IR_OPND_UNUSED            0x0
 #define IR_OPND_DATA              0x1
 #define IR_OPND_CONTROL           0x2
-#define IR_OPND_CONTROL_DEP       0x3
-#define IR_OPND_CONTROL_REF       0x4
-#define IR_OPND_STR               0x5
-#define IR_OPND_NUM               0x6
-#define IR_OPND_PROB              0x7
-#define IR_OPND_PROTO             0x8
+#define IR_OPND_LABEL_REF         0x3
+#define IR_OPND_CONTROL_DEP       0x4
+#define IR_OPND_CONTROL_REF       0x5
+#define IR_OPND_STR               0x6
+#define IR_OPND_NUM               0x7
+#define IR_OPND_PROB              0x8
+#define IR_OPND_PROTO             0x9

 #define IR_OP_FLAGS(op_flags, op1_flags, op2_flags, op3_flags) \
 	((op_flags) | ((op1_flags) << 20) | ((op2_flags) << 24) | ((op3_flags) << 28))
@@ -1013,6 +1014,7 @@ IR_ALWAYS_INLINE uint32_t ir_insn_len(const ir_insn *insn)
 #define IR_HAS_VA_ARG_FP       (1<<9)
 #define IR_HAS_FP_RET_SLOT     (1<<10)
 #define IR_16B_FRAME_ALIGNMENT (1<<11)
+#define IR_HAS_BLOCK_ADDR      (1<<12)

 /* Temporary: MEM2SSA -> SCCP */
 #define IR_MEM2SSA_VARS        (1<<25)
@@ -1248,11 +1250,10 @@ struct _ir_live_range {
 #define IR_LIVE_INTERVAL_HAS_HINT_REGS   (1<<2)
 #define IR_LIVE_INTERVAL_HAS_HINT_REFS   (1<<3)
 #define IR_LIVE_INTERVAL_MEM_PARAM       (1<<4)
-#define IR_LIVE_INTERVAL_MEM_LOAD        (1<<5)
-#define IR_LIVE_INTERVAL_COALESCED       (1<<6)
-#define IR_LIVE_INTERVAL_SPILL_SPECIAL   (1<<7) /* spill slot is pre-allocated in a special area (see ir_ctx.spill_reserved_base) */
-#define IR_LIVE_INTERVAL_SPILLED         (1<<8)
-#define IR_LIVE_INTERVAL_SPLIT_CHILD     (1<<9)
+#define IR_LIVE_INTERVAL_COALESCED       (1<<5)
+#define IR_LIVE_INTERVAL_SPILL_SPECIAL   (1<<6) /* spill slot is pre-allocated in a special area (see ir_ctx.spill_reserved_base) */
+#define IR_LIVE_INTERVAL_SPILLED         (1<<7)
+#define IR_LIVE_INTERVAL_SPLIT_CHILD     (1<<8)

 struct _ir_live_interval {
 	uint8_t           type;
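
In ir_private.h, inserting IR_OPND_LABEL_REF renumbers every operand kind after it, and dropping IR_LIVE_INTERVAL_MEM_LOAD renumbers the live-interval flags; both sets are private to the library, so no ABI is exposed. Per the IR_OP_FLAGS packing shown above, each operand kind occupies 4 bits of the 32-bit flags word, which a consumer would unpack roughly like this (a sketch, not code from the patch):

	/* unpack the three operand kinds packed by IR_OP_FLAGS() */
	uint32_t op1_kind = (flags >> 20) & 0xf;
	uint32_t op2_kind = (flags >> 24) & 0xf;
	uint32_t op3_kind = (flags >> 28) & 0xf;
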
diff --git a/ext/opcache/jit/ir/ir_ra.c b/ext/opcache/jit/ir/ir_ra.c
index 21c7ee3ac64..2e8a8e3f34f 100644
--- a/ext/opcache/jit/ir/ir_ra.c
+++ b/ext/opcache/jit/ir/ir_ra.c
@@ -776,9 +776,6 @@ int ir_compute_live_ranges(ir_ctx *ctx)
 						if (insn->op == IR_PARAM) {
 							/* We may reuse parameter stack slot for spilling */
 							ctx->live_intervals[v]->flags |= IR_LIVE_INTERVAL_MEM_PARAM;
-						} else if (insn->op == IR_VLOAD) {
-							/* Load may be fused into the usage instruction */
-							ctx->live_intervals[v]->flags |= IR_LIVE_INTERVAL_MEM_LOAD;
 						}
 						def_pos = IR_DEF_LIVE_POS_FROM_REF(ref);
 					}
@@ -845,11 +842,17 @@ int ir_compute_live_ranges(ir_ctx *ctx)
 							ival = ctx->live_intervals[v];
 						}
 						ir_add_use(ctx, ival, j, use_pos, reg, IR_USE_FLAGS(def_flags, j), hint_ref);
-					} else if (ctx->rules) {
-						if (ctx->rules[input] & IR_FUSED) {
-						    ir_add_fusion_ranges(ctx, ref, input, bb, live);
-						} else if (ctx->rules[input] == (IR_SKIPPED|IR_RLOAD)) {
-							ir_set_alocated_reg(ctx, ref, j, ctx->ir_base[input].op2);
+					} else {
+						if (ctx->rules) {
+							if ((ctx->rules[input] & (IR_FUSED|IR_SKIPPED)) == IR_FUSED) {
+								ir_add_fusion_ranges(ctx, ref, input, bb, live);
+							} else if (ctx->rules[input] == (IR_SKIPPED|IR_RLOAD)) {
+								ir_set_alocated_reg(ctx, ref, j, ctx->ir_base[input].op2);
+							}
+						}
+						if (reg != IR_REG_NONE) {
+							use_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
+							ir_add_fixed_live_range(ctx, reg, use_pos, use_pos + IR_USE_SUB_REF);
 						}
 					}
 				} else if (reg != IR_REG_NONE) {
@@ -1396,9 +1399,6 @@ int ir_compute_live_ranges(ir_ctx *ctx)
 						if (insn->op == IR_PARAM) {
 							/* We may reuse parameter stack slot for spilling */
 							ctx->live_intervals[v]->flags |= IR_LIVE_INTERVAL_MEM_PARAM;
-						} else if (insn->op == IR_VLOAD) {
-							/* Load may be fused into the usage instruction */
-							ctx->live_intervals[v]->flags |= IR_LIVE_INTERVAL_MEM_LOAD;
 						}
 						def_pos = IR_DEF_LIVE_POS_FROM_REF(ref);
 					}
@@ -1465,17 +1465,17 @@ int ir_compute_live_ranges(ir_ctx *ctx)
 							ival = ctx->live_intervals[v];
 						}
 						ir_add_use(ctx, ival, j, use_pos, reg, IR_USE_FLAGS(def_flags, j), hint_ref);
-					} else if (ctx->rules) {
-						if (ctx->rules[input] & IR_FUSED) {
-						    ir_add_fusion_ranges(ctx, ref, input, bb, live_in_block, b);
-						} else {
-							if (ctx->rules[input] == (IR_SKIPPED|IR_RLOAD)) {
+					} else {
+						if (ctx->rules) {
+							if ((ctx->rules[input] & (IR_FUSED|IR_SKIPPED)) == IR_FUSED) {
+								ir_add_fusion_ranges(ctx, ref, input, bb, live_in_block, b);
+							} else if (ctx->rules[input] == (IR_SKIPPED|IR_RLOAD)) {
 								ir_set_alocated_reg(ctx, ref, j, ctx->ir_base[input].op2);
 							}
-							if (reg != IR_REG_NONE) {
-								use_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
-								ir_add_fixed_live_range(ctx, reg, use_pos, use_pos + IR_USE_SUB_REF);
-							}
+						}
+						if (reg != IR_REG_NONE) {
+							use_pos = IR_LOAD_LIVE_POS_FROM_REF(ref);
+							ir_add_fixed_live_range(ctx, reg, use_pos, use_pos + IR_USE_SUB_REF);
 						}
 					}
 				} else if (reg != IR_REG_NONE) {
@@ -1605,7 +1605,7 @@ static void ir_vregs_join(ir_ctx *ctx, uint32_t r1, uint32_t r2)
 		}
 		while (*prev && ((*prev)->pos < use_pos->pos ||
 			((*prev)->pos == use_pos->pos &&
-				(use_pos->op_num == 0 || (*prev)->op_num < use_pos->op_num)))) {
+				(use_pos->op_num == 0 || ((*prev)->op_num != 0 && (*prev)->op_num < use_pos->op_num))))) {
 			if ((*prev)->hint_ref > 0 && ctx->vregs[(*prev)->hint_ref] == r2) {
 				(*prev)->hint_ref = 0;
 			}
@@ -1627,9 +1627,6 @@ static void ir_vregs_join(ir_ctx *ctx, uint32_t r1, uint32_t r2)

 	ctx->live_intervals[r1]->flags |=
 		IR_LIVE_INTERVAL_COALESCED | (ival->flags & (IR_LIVE_INTERVAL_HAS_HINT_REGS|IR_LIVE_INTERVAL_HAS_HINT_REFS));
-	if (ctx->ir_base[IR_LIVE_POS_TO_REF(ctx->live_intervals[r1]->use_pos->pos)].op != IR_VLOAD) {
-		ctx->live_intervals[r1]->flags &= ~IR_LIVE_INTERVAL_MEM_LOAD;
-	}
 	if (ival->flags & IR_LIVE_INTERVAL_MEM_PARAM) {
 		IR_ASSERT(!(ctx->live_intervals[r1]->flags & IR_LIVE_INTERVAL_MEM_PARAM));
 		ctx->live_intervals[r1]->flags |= IR_LIVE_INTERVAL_MEM_PARAM;
@@ -2343,16 +2340,6 @@ static ir_live_pos ir_first_use_pos_after(ir_live_interval *ival, ir_live_pos po
 	return p ? p->pos : 0x7fffffff;
 }

-static ir_live_pos ir_first_use_pos(ir_live_interval *ival, uint8_t flags)
-{
-	ir_use_pos *p = ival->use_pos;
-
-	while (p && !(p->flags & flags)) {
-		p = p->next;
-	}
-	return p ? p->pos : 0x7fffffff;
-}
-
 static ir_block *ir_block_from_live_pos(ir_ctx *ctx, ir_live_pos pos)
 {
 	ir_ref ref = IR_LIVE_POS_TO_REF(pos);
@@ -3194,7 +3181,6 @@ static ir_reg ir_allocate_blocked_reg(ir_ctx *ctx, ir_live_interval *ival, ir_li
 		/* split current before its first use position that requires a register */
 		ir_live_pos split_pos;

-spill_current:
 		if (next_use_pos == ival->range.start) {
 			IR_ASSERT(ival->use_pos && ival->use_pos->op_num == 0);
 			/* split right after definition */
@@ -3228,7 +3214,6 @@ static ir_reg ir_allocate_blocked_reg(ir_ctx *ctx, ir_live_interval *ival, ir_li
 			return IR_REG_NONE;
 		}
 		if (split_pos >= blockPos[reg]) {
-try_next_available_register:
 			IR_REGSET_EXCL(available, reg);
 			if (IR_REGSET_IS_EMPTY(available)) {
 				fprintf(stderr, "LSRA Internal Error: Unsolvable conflict. Allocation is not possible\n");
@@ -3274,23 +3259,6 @@ static ir_reg ir_allocate_blocked_reg(ir_ctx *ctx, ir_live_interval *ival, ir_li
 					}
 					IR_LOG_LSRA("      ---- Finish", other, "");
 				} else {
-					if (ir_first_use_pos(other, IR_USE_MUST_BE_IN_REG) <= other->end) {
-						if (!(ival->flags & IR_LIVE_INTERVAL_TEMP)) {
-							next_use_pos = ir_first_use_pos(ival, IR_USE_MUST_BE_IN_REG);
-							if (next_use_pos == ival->range.start) {
-								IR_ASSERT(ival->use_pos && ival->use_pos->op_num == 0);
-								/* split right after definition */
-								split_pos = next_use_pos + 1;
-							} else {
-								split_pos = ir_find_optimal_split_position(ctx, ival, ival->range.start, next_use_pos - 1, 1);
-							}
-
-							if (split_pos > ival->range.start) {
-								goto spill_current;
-							}
-						}
-						goto try_next_available_register;
-					}
 					child = other;
 					other->reg = IR_REG_NONE;
 					if (prev) {
@@ -3400,12 +3368,13 @@ static int ir_fix_dessa_tmps(ir_ctx *ctx, uint8_t type, ir_ref from, ir_ref to)
 static bool ir_ival_spill_for_fuse_load(ir_ctx *ctx, ir_live_interval *ival, ir_reg_alloc_data *data)
 {
 	ir_use_pos *use_pos = ival->use_pos;
-	ir_insn *insn;

 	if (ival->flags & IR_LIVE_INTERVAL_MEM_PARAM) {
 		IR_ASSERT(!ival->next && use_pos && use_pos->op_num == 0);
-		insn = &ctx->ir_base[IR_LIVE_POS_TO_REF(use_pos->pos)];
+#if IR_DEBUG
+		ir_insn *insn = &ctx->ir_base[IR_LIVE_POS_TO_REF(use_pos->pos)];
 		IR_ASSERT(insn->op == IR_PARAM);
+#endif
 		use_pos = use_pos->next;
 		if (use_pos && (use_pos->next || (use_pos->flags & IR_USE_MUST_BE_IN_REG))) {
 			return 0;
@@ -3418,38 +3387,6 @@ static bool ir_ival_spill_for_fuse_load(ir_ctx *ctx, ir_live_interval *ival, ir_
 			}
 		}

-		return 1;
-	} else if (ival->flags & IR_LIVE_INTERVAL_MEM_LOAD) {
-		insn = &ctx->ir_base[IR_LIVE_POS_TO_REF(use_pos->pos)];
-		IR_ASSERT(insn->op == IR_VLOAD);
-		IR_ASSERT(ctx->ir_base[insn->op2].op == IR_VAR);
-		use_pos = use_pos->next;
-		if (use_pos && (use_pos->next || (use_pos->flags & IR_USE_MUST_BE_IN_REG))) {
-			return 0;
-		}
-
-		if (use_pos) {
-			ir_block *bb = ir_block_from_live_pos(ctx, use_pos->pos);
-			if (bb->loop_depth && bb != ir_block_from_live_pos(ctx, ival->use_pos->pos)) {
-				return 0;
-			}
-			/* check if VAR may be clobbered between VLOAD and use */
-			ir_use_list *use_list = &ctx->use_lists[insn->op2];
-			ir_ref n = use_list->count;
-			ir_ref *p = &ctx->use_edges[use_list->refs];
-			for (; n > 0; p++, n--) {
-				ir_ref use = *p;
-				if (ctx->ir_base[use].op == IR_VSTORE) {
-					if (use > IR_LIVE_POS_TO_REF(ival->use_pos->pos) && use < IR_LIVE_POS_TO_REF(use_pos->pos)) {
-						return 0;
-					}
-				} else if (ctx->ir_base[use].op == IR_VADDR) {
-					return 0;
-				}
-			}
-		}
-		ival->stack_spill_pos = ctx->ir_base[insn->op2].op3;
-
 		return 1;
 	}
 	return 0;
@@ -3554,7 +3491,7 @@ static int ir_linear_scan(ir_ctx *ctx)
 	for (j = ctx->vregs_count; j != 0; j--) {
 		ival = ctx->live_intervals[j];
 		if (ival) {
-			if (!(ival->flags & (IR_LIVE_INTERVAL_MEM_PARAM|IR_LIVE_INTERVAL_MEM_LOAD))
+			if (!(ival->flags & IR_LIVE_INTERVAL_MEM_PARAM)
 					|| !ir_ival_spill_for_fuse_load(ctx, ival, &data)) {
 				ir_add_to_unhandled(&unhandled, ival);
 			}
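
Two related corrections run through the ir_compute_live_ranges() hunks. First, the fused-input test tightens from a plain IR_FUSED bit test to a mask comparison, so instructions marked both fused and skipped no longer get fusion ranges:

	/* true only when IR_FUSED is set and IR_SKIPPED is clear */
	if ((ctx->rules[input] & (IR_FUSED|IR_SKIPPED)) == IR_FUSED) { ... }

Second, the reg != IR_REG_NONE pinning of a fixed live range is hoisted out of the ctx->rules branch, so an input with a register hint gets its physical register reserved around the use even when no rules array exists (the first copy of the loop previously lacked this pinning entirely). Dropping IR_LIVE_INTERVAL_MEM_LOAD removes the VLOAD spill-reuse heuristic along with its now-unused ir_first_use_pos() helper and the VAR-clobber scan in ir_ival_spill_for_fuse_load().
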
diff --git a/ext/opcache/jit/ir/ir_save.c b/ext/opcache/jit/ir/ir_save.c
index 5ba986fadd4..dd955172950 100644
--- a/ext/opcache/jit/ir/ir_save.c
+++ b/ext/opcache/jit/ir/ir_save.c
@@ -40,6 +40,11 @@ void ir_print_proto_ex(uint8_t flags, ir_type ret_type, uint32_t params_count, c
 	} else if (flags & IR_BUILTIN_FUNC) {
 		fprintf(f, " __builtin");
 	}
+	if (flags & IR_CONST_FUNC) {
+		fprintf(f, " __const");
+	} else if (flags & IR_PURE_FUNC) {
+		fprintf(f, " __pure");
+	}
 }

 static void ir_save_dessa_moves(const ir_ctx *ctx, int b, ir_block *bb, FILE *f)
@@ -109,6 +114,10 @@ void ir_save(const ir_ctx *ctx, uint32_t save_flags, FILE *f)
 			fprintf(f, "sym(%s%s)",
 				(save_flags & IR_SAVE_SAFE_NAMES) ? "@" : "",
 				ir_get_str(ctx, insn->val.name));
+		} else if (insn->op == IR_LABEL) {
+			fprintf(f, "label(%s%s)",
+				(save_flags & IR_SAVE_SAFE_NAMES) ? "@" : "",
+				ir_get_str(ctx, insn->val.name));
 		} else if (insn->op == IR_FUNC_ADDR) {
 			fprintf(f, "func *");
 			ir_print_const(ctx, insn, f, true);
@@ -272,6 +281,13 @@ void ir_save(const ir_ctx *ctx, uint32_t save_flags, FILE *f)
 						fprintf(f, "%s%d", first ? "(" : ", ", ref);
 						first = 0;
 						break;
+					case IR_OPND_LABEL_REF:
+						if (ref) {
+							IR_ASSERT(IR_IS_CONST_REF(ref));
+							fprintf(f, "%sc_%d", first ? "(" : ", ", -ref);
+							first = 0;
+						}
+						break;
 				}
 			} else if (opnd_kind == IR_OPND_NUM) {
 				fprintf(f, "%s%d", first ? "(" : ", ", ref);
diff --git a/ext/opcache/jit/ir/ir_sccp.c b/ext/opcache/jit/ir/ir_sccp.c
index 45df92ec2be..e2f38a058ae 100644
--- a/ext/opcache/jit/ir/ir_sccp.c
+++ b/ext/opcache/jit/ir/ir_sccp.c
@@ -1508,8 +1508,8 @@ static bool ir_may_promote_f2d(ir_ctx *ctx, ir_ref ref)
 		switch (insn->op) {
 			case IR_FP2FP:
 				return 1;
-			case IR_INT2FP:
-				return ctx->use_lists[ref].count == 1;
+//			case IR_INT2FP:
+//				return ctx->use_lists[ref].count == 1;
 			case IR_NEG:
 			case IR_ABS:
 				return ctx->use_lists[ref].count == 1 &&
@@ -2110,7 +2110,9 @@ static bool ir_try_promote_induction_var_ext(ir_ctx *ctx, ir_ref ext_ref, ir_ref
 				 && !IR_IS_SYM_CONST(ctx->ir_base[use_insn->op1].op)) {
 					ctx->ir_base[use].op1 = ir_ext_const(ctx, &ctx->ir_base[use_insn->op1], op, type);
 				} else {
-					ctx->ir_base[use].op1 = ir_ext_ref(ctx, use, use_insn->op1, op, type, worklist);
+					ir_ref tmp = ir_ext_ref(ctx, use, use_insn->op1, op, type, worklist);
+					use_insn = &ctx->ir_base[use];
+					use_insn->op1 = tmp;
 				}
 				ir_bitqueue_add(worklist, use);
 			}
@@ -2119,7 +2121,9 @@ static bool ir_try_promote_induction_var_ext(ir_ctx *ctx, ir_ref ext_ref, ir_ref
 				 && !IR_IS_SYM_CONST(ctx->ir_base[use_insn->op2].op)) {
 					ctx->ir_base[use].op2 = ir_ext_const(ctx, &ctx->ir_base[use_insn->op2], op, type);
 				} else {
-					ctx->ir_base[use].op2 = ir_ext_ref(ctx, use, use_insn->op2, op, type, worklist);
+					ir_ref tmp = ir_ext_ref(ctx, use, use_insn->op2, op, type, worklist);
+					use_insn = &ctx->ir_base[use];
+					use_insn->op2 = tmp;
 				}
 				ir_bitqueue_add(worklist, use);
 			}
@@ -2147,7 +2151,9 @@ static bool ir_try_promote_induction_var_ext(ir_ctx *ctx, ir_ref ext_ref, ir_ref
 					 && !IR_IS_SYM_CONST(ctx->ir_base[use_insn->op1].op)) {
 						ctx->ir_base[use].op1 = ir_ext_const(ctx, &ctx->ir_base[use_insn->op1], op, type);
 					} else {
-						ctx->ir_base[use].op1 = ir_ext_ref(ctx, use, use_insn->op1, op, type, worklist);
+						ir_ref tmp = ir_ext_ref(ctx, use, use_insn->op1, op, type, worklist);
+						use_insn = &ctx->ir_base[use];
+						use_insn->op1 = tmp;
 					}
 					ir_bitqueue_add(worklist, use);
 				}
@@ -2156,7 +2162,9 @@ static bool ir_try_promote_induction_var_ext(ir_ctx *ctx, ir_ref ext_ref, ir_ref
 					 && !IR_IS_SYM_CONST(ctx->ir_base[use_insn->op2].op)) {
 						ctx->ir_base[use].op2 = ir_ext_const(ctx, &ctx->ir_base[use_insn->op2], op, type);
 					} else {
-						ctx->ir_base[use].op2 = ir_ext_ref(ctx, use, use_insn->op2, op, type, worklist);
+						ir_ref tmp = ir_ext_ref(ctx, use, use_insn->op2, op, type, worklist);
+						use_insn = &ctx->ir_base[use];
+						use_insn->op2 = tmp;
 					}
 					ir_bitqueue_add(worklist, use);
 				}
@@ -2178,7 +2186,8 @@ static bool ir_try_promote_induction_var_ext(ir_ctx *ctx, ir_ref ext_ref, ir_ref
 	 && !IR_IS_SYM_CONST(ctx->ir_base[phi_insn->op2].op)) {
 		ctx->ir_base[phi_ref].op2 = ir_ext_const(ctx, &ctx->ir_base[phi_insn->op2], op, type);
 	} else {
-		ctx->ir_base[phi_ref].op2 = ir_ext_ref(ctx, phi_ref, phi_insn->op2, op, type, worklist);
+		ir_ref tmp = ir_ext_ref(ctx, phi_ref, phi_insn->op2, op, type, worklist);
+		ctx->ir_base[phi_ref].op2 = tmp;
 	}

 	return 1;
@@ -2251,42 +2260,6 @@ static void ir_merge_blocks(ir_ctx *ctx, ir_ref end, ir_ref begin, ir_bitqueue *
 	ir_ref prev, next;
 	ir_use_list *use_list;

-	if (ctx->use_lists[begin].count > 1) {
-		ir_ref *p, n, i, use;
-		ir_insn *use_insn;
-		ir_ref region = end;
-		ir_ref next = IR_UNUSED;
-
-		while (!IR_IS_BB_START(ctx->ir_base[region].op)) {
-			region = ctx->ir_base[region].op1;
-		}
-
-		use_list = &ctx->use_lists[begin];
-		n = use_list->count;
-		for (p = &ctx->use_edges[use_list->refs], i = 0; i < n; p++, i++) {
-			use = *p;
-			use_insn = &ctx->ir_base[use];
-			if (ir_op_flags[use_insn->op] & IR_OP_FLAG_CONTROL) {
-				IR_ASSERT(!next);
-				next = use;
-			} else {
-				IR_ASSERT(use_insn->op == IR_VAR);
-				IR_ASSERT(use_insn->op1 == begin);
-				use_insn->op1 = region;
-				if (ir_use_list_add(ctx, region, use)) {
-					/* restore after reallocation */
-					use_list = &ctx->use_lists[begin];
-					n = use_list->count;
-					p = &ctx->use_edges[use_list->refs + i];
-				}
-			}
-		}
-
-		IR_ASSERT(next);
-		ctx->use_edges[use_list->refs] = next;
-		use_list->count = 1;
-	}
-
 	IR_ASSERT(ctx->ir_base[begin].op == IR_BEGIN);
 	IR_ASSERT(ctx->ir_base[end].op == IR_END);
 	IR_ASSERT(ctx->ir_base[begin].op1 == end);
@@ -3595,7 +3568,10 @@ void ir_iter_opt(ir_ctx *ctx, ir_bitqueue *worklist)
 			if (!(ctx->flags & IR_OPT_CFG)) {
 				/* pass */
 			} else if (insn->op == IR_BEGIN) {
-				if (insn->op1 && ctx->ir_base[insn->op1].op == IR_END) {
+				if (insn->op1
+				 && !insn->op2 /* no computed goto label */
+				 && ctx->use_lists[i].count == 1
+				 && ctx->ir_base[insn->op1].op == IR_END) {
 					ir_merge_blocks(ctx, insn->op1, i, worklist);
 				}
 			} else if (insn->op == IR_MERGE) {
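
Every ir_try_promote_induction_var_ext() hunk above fixes the same C pitfall: in an assignment like ctx->ir_base[use].op1 = ir_ext_ref(...), the two sides are unsequenced, so the compiler may compute the address &ctx->ir_base[use] before the call, and ir_ext_ref() can append an instruction and reallocate ctx->ir_base, leaving the store to write through a dangling pointer. Distilled (grow_and_get_ref() stands in for ir_ext_ref()):

	/* buggy: &buf[i] may be evaluated before the call reallocates buf */
	buf[i].field = grow_and_get_ref(ctx);

	/* fixed: sequence the call first, then recompute from the new base */
	ir_ref tmp = grow_and_get_ref(ctx);
	buf[i].field = tmp;

The use_insn reloads after each call exist for the same reason. The commented-out IR_INT2FP case in ir_may_promote_f2d() disables float-to-double promotion through INT2FP while keeping the code in place, and ir_merge_blocks() loses its VAR-rehoming preamble now that BEGIN blocks with extra users are no longer merged (see the ctx->use_lists[i].count == 1 guard added in ir_iter_opt()).
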
diff --git a/ext/opcache/jit/ir/ir_x86.dasc b/ext/opcache/jit/ir/ir_x86.dasc
index 42e4eee7da0..7f714dd11d2 100644
--- a/ext/opcache/jit/ir/ir_x86.dasc
+++ b/ext/opcache/jit/ir/ir_x86.dasc
@@ -66,7 +66,7 @@ IR_ALWAYS_INLINE ir_mem IR_MEM(ir_reg base, int32_t offset, ir_reg index, int32_

 #define IR_SPILL_POS_TO_OFFSET(offset) \
 	((ctx->flags & IR_USE_FRAME_POINTER) ? \
-		((offset) - (ctx->stack_frame_size - ctx->stack_frame_alignment)) : \
+		((offset) - ctx->stack_frame_size) : \
 		((offset) + ctx->call_stack_size))

 |.macro ASM_EXPAND_OP_MEM, MACRO, op, type, op1
@@ -892,6 +892,9 @@ typedef struct _ir_backend_data {
 	bool               double_abs_const;
 	bool               float_abs_const;
 	bool               double_zero_const;
+	bool               u2d_const;
+	bool               u2f_const;
+	bool               resolved_label_syms;
 } ir_backend_data;

 #define IR_GP_REG_NAME(code, name64, name32, name16, name8, name8h) \
@@ -1087,6 +1090,7 @@ const char *ir_reg_name(int8_t reg, ir_type type)
 	_(SSE_TRUNC)           \
 	_(SSE_NEARBYINT)       \
 	_(BIT_OP)              \
+	_(IGOTO_DUP)           \

 #define IR_LEA_FIRST IR_LEA_OB
 #define IR_LEA_LAST  IR_LEA_O_SYM
@@ -1110,35 +1114,24 @@ const char *ir_rule_name[IR_LAST_OP] = {

 static bool ir_may_fuse_addr(ir_ctx *ctx, const ir_insn *addr_insn)
 {
-	if (sizeof(void*) == 4) {
-		return 1;
+	if (addr_insn->op == IR_LABEL) {
+		return 0;
 	} else if (IR_IS_SYM_CONST(addr_insn->op)) {
 		void *addr = ir_sym_addr(ctx, addr_insn);

 		if (!addr) {
 			return 0;
 		}
-		return IR_IS_SIGNED_32BIT((int64_t)(intptr_t)addr);
+		return (sizeof(void*) == 4) || IR_IS_SIGNED_32BIT((int64_t)(intptr_t)addr);
 	} else {
-		return IR_IS_SIGNED_32BIT(addr_insn->val.i64);
+		return (sizeof(void*) == 4) || IR_IS_SIGNED_32BIT(addr_insn->val.i64);
 	}
 }

 static bool ir_may_fuse_imm(ir_ctx *ctx, const ir_insn *val_insn)
 {
 	if (val_insn->type == IR_ADDR) {
-		if (sizeof(void*) == 4) {
-			return 1;
-		} else if (IR_IS_SYM_CONST(val_insn->op)) {
-			void *addr = ir_sym_addr(ctx, val_insn);
-
-			if (!addr) {
-				return 0;
-			}
-			return IR_IS_SIGNED_32BIT((intptr_t)addr);
-		} else {
-			return IR_IS_SIGNED_32BIT(val_insn->val.i64);
-		}
+		return ir_may_fuse_addr(ctx, val_insn);
 	} else {
 		return (ir_type_size[val_insn->type] <= 4 || IR_IS_SIGNED_32BIT(val_insn->val.i64));
 	}
@@ -1517,6 +1510,11 @@ op2_const:
 				constraints->tmp_regs[0] = IR_TMP_REG(1, ctx->ir_base[insn->op1].type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
 				n = 1;
 			}
+			if (IR_IS_TYPE_UNSIGNED(ctx->ir_base[insn->op1].type)
+			 && ir_type_size[ctx->ir_base[insn->op1].type] >= sizeof(void*)) {
+				constraints->tmp_regs[n] = IR_TMP_REG(2, ctx->ir_base[insn->op1].type, IR_USE_SUB_REF, IR_DEF_SUB_REF);
+				n++;
+			}
 			break;
 		case IR_ABS_INT:
 			flags = IR_DEF_CONFLICTS_WITH_INPUT_REGS | IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
@@ -1542,6 +1540,7 @@ op2_const:
 		case IR_GUARD_NOT:
 			flags = IR_OP2_SHOULD_BE_IN_REG;
 			break;
+		case IR_IGOTO:
 		case IR_IJMP:
 			flags = IR_OP2_SHOULD_BE_IN_REG;
 			break;
@@ -1574,7 +1573,7 @@ op2_const:
 			}
 			break;
 		case IR_VA_ARG:
-			flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
+			flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_DEF_CONFLICTS_WITH_INPUT_REGS;
 			constraints->tmp_regs[0] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
 			n = 1;
 			insn = &ctx->ir_base[ref];
@@ -1669,7 +1668,9 @@ static void ir_match_fuse_addr(ir_ctx *ctx, ir_ref addr_ref)

 				do {
 					ir_insn *insn = &ctx->ir_base[*p];
-					if (insn->op != IR_LOAD && (insn->op != IR_STORE || insn->op3 == addr_ref)) {
+					if (insn->op != IR_LOAD
+					 && insn->op != IR_LOAD_v
+					 && ((insn->op != IR_STORE && insn->op != IR_STORE_v) || insn->op3 == addr_ref)) {
 						return;
 					}
 					p++;
@@ -1752,7 +1753,7 @@ static bool ir_match_has_mem_deps(ir_ctx *ctx, ir_ref ref, ir_ref root)
 		do {
 			ir_insn *insn = &ctx->ir_base[pos];

-			if (insn->op == IR_STORE) {
+			if (insn->op == IR_STORE || insn->op == IR_STORE_v || insn->op == IR_VSTORE || insn->op == IR_VSTORE_v) {
 				// TODO: check if LOAD and STORE addresses may alias
 				return 1;
 			} else if (insn->op == IR_CALL) {
@@ -1766,8 +1767,9 @@ static bool ir_match_has_mem_deps(ir_ctx *ctx, ir_ref ref, ir_ref root)

 static void ir_match_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root)
 {
-	if (ir_in_same_block(ctx, ref)
-	 && ctx->ir_base[ref].op == IR_LOAD) {
+	if (ir_in_same_block(ctx, ref) &&
+	    (ctx->ir_base[ref].op == IR_LOAD || ctx->ir_base[ref].op == IR_LOAD_v ||
+	     ctx->ir_base[ref].op == IR_VLOAD || ctx->ir_base[ref].op == IR_VLOAD_v)) {
 		if (ctx->use_lists[ref].count == 2
 		 && !ir_match_has_mem_deps(ctx, ref, root)) {
 			ir_ref addr_ref = ctx->ir_base[ref].op2;
@@ -1792,7 +1794,7 @@ static bool ir_match_try_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root)
 	ir_insn *insn = &ctx->ir_base[ref];

 	if (ir_in_same_block(ctx, ref)
-	 && insn->op == IR_LOAD) {
+	 && (insn->op == IR_LOAD || insn->op == IR_LOAD_v || insn->op == IR_VLOAD || insn->op == IR_VLOAD_v)) {
 		if (ctx->use_lists[ref].count == 2
 		 && !ir_match_has_mem_deps(ctx, ref, root)) {
 			ir_ref addr_ref = ctx->ir_base[ref].op2;
@@ -1814,8 +1816,6 @@ static bool ir_match_try_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root)
 		 && ir_get_param_reg(ctx, ref) == IR_REG_NONE) {
 			return 1;
 		}
-	} else if (ctx->ir_base[ref].op == IR_VLOAD) {
-		return 1;
 	}
 	return 0;
 }
@@ -2462,8 +2462,21 @@ binop_fp:
 		case IR_IJMP:
 			ir_match_fuse_load(ctx, insn->op2, ref);
 			return insn->op;
+		case IR_IGOTO:
+			if (ctx->ir_base[insn->op1].op == IR_MERGE || ctx->ir_base[insn->op1].op == IR_LOOP_BEGIN) {
+				ir_insn *merge = &ctx->ir_base[insn->op1];
+				ir_ref *p, n = merge->inputs_count;
+
+				for (p = merge->ops + 1; n > 0; p++, n--) {
+					ir_ref input = *p;
+					IR_ASSERT(ctx->ir_base[input].op == IR_END || ctx->ir_base[input].op == IR_LOOP_END);
+					ctx->rules[input] = IR_IGOTO_DUP;
+				}
+			}
+			ir_match_fuse_load(ctx, insn->op2, ref);
+			return insn->op;
 		case IR_VAR:
-			return IR_SKIPPED | IR_VAR;
+			return IR_STATIC_ALLOCA;
 		case IR_PARAM:
 #ifndef _WIN64
 			if (ctx->value_params && ctx->value_params[insn->op3 - 1].align) {
@@ -2617,7 +2630,15 @@ store_int:
 				return IR_VSTORE_FP;
 			}
 			break;
+		case IR_VSTORE_v:
+			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op3].type)) {
+				return IR_VSTORE_INT;
+			} else {
+				return IR_VSTORE_FP;
+			}
+			break;
 		case IR_LOAD:
+		case IR_LOAD_v:
 			ir_match_fuse_addr(ctx, insn->op2);
 			if (IR_IS_TYPE_INT(insn->type)) {
 				return IR_LOAD_INT;
@@ -2635,6 +2656,14 @@ store_int:
 				return IR_STORE_FP;
 			}
 			break;
+		case IR_STORE_v:
+			ir_match_fuse_addr(ctx, insn->op2);
+			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op3].type)) {
+				return IR_STORE_INT;
+			} else {
+				return IR_STORE_FP;
+			}
+			break;
 		case IR_RLOAD:
 			if (IR_REGSET_IN(IR_REGSET_UNION((ir_regset)ctx->fixed_regset, IR_REGSET_FIXED), insn->op2)) {
 				return IR_SKIPPED | IR_RLOAD;
@@ -3175,7 +3204,7 @@ static void ir_emit_load_imm_fp(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref sr
 			|	xorpd xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
 		}
 	} else {
-		label = ir_const_label(ctx, src);
+		label = ir_get_const_label(ctx, src);
 		|	ASM_FP_REG_TXT_OP movs, type, reg, [=>label]
 	}
 }
@@ -3229,6 +3258,38 @@ static void ir_load_local_addr(ir_ctx *ctx, ir_reg reg, ir_ref src)
 	}
 }

+static void ir_resolve_label_syms(ir_ctx *ctx)
+{
+	uint32_t b;
+	ir_block *bb;
+
+	for (b = 1, bb = &ctx->cfg_blocks[b]; b <= ctx->cfg_blocks_count; bb++, b++) {
+		ir_insn *insn = &ctx->ir_base[bb->start];
+
+		if (insn->op == IR_BEGIN && insn->op2) {
+			IR_ASSERT(ctx->ir_base[insn->op2].op == IR_LABEL);
+			ctx->ir_base[insn->op2].val.u32_hi = b;
+		}
+	}
+}
+
+static void ir_emit_load_label_addr(ir_ctx *ctx, ir_reg reg, ir_insn *label)
+{
+	ir_backend_data *data = ctx->data;
+	dasm_State **Dst = &data->dasm_state;
+
+	if (!data->resolved_label_syms) {
+		data->resolved_label_syms = 1;
+		ir_resolve_label_syms(ctx);
+	}
+
+	IR_ASSERT(label->op == IR_LABEL);
+	int b = label->val.u32_hi;
+
+	b = ir_skip_empty_target_blocks(ctx, b);
+	|	lea Ra(reg), aword [=>b]
+}
+
 static void ir_emit_load(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src)
 {
 	if (IR_IS_CONST_REF(src)) {
@@ -3241,9 +3302,11 @@ static void ir_emit_load(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src)
 			} else if (insn->op == IR_STR) {
 				ir_backend_data *data = ctx->data;
 				dasm_State **Dst = &data->dasm_state;
-				int label = ir_const_label(ctx, src);
+				int label = ir_get_const_label(ctx, src);

 				|	lea Ra(reg), aword [=>label]
+			} else if (insn->op == IR_LABEL) {
+				ir_emit_load_label_addr(ctx, reg, insn);
 			} else {
 				ir_emit_load_imm_int(ctx, type, reg, insn->val.i64);
 			}
@@ -3289,7 +3352,7 @@ static void ir_emit_store_mem_int_const(ir_ctx *ctx, ir_type type, ir_mem mem, i

 	IR_ASSERT(IR_IS_CONST_REF(src));
 	if (val_insn->op == IR_STR) {
-		int label = ir_const_label(ctx, src);
+		int label = ir_get_const_label(ctx, src);

 		IR_ASSERT(tmp_reg != IR_REG_NONE);
 |.if X64
@@ -3298,6 +3361,11 @@ static void ir_emit_store_mem_int_const(ir_ctx *ctx, ir_type type, ir_mem mem, i
 |.else
 		|	ASM_TMEM_TXT_OP mov, aword, mem, =>label
 |.endif
+	} else if (val_insn->op == IR_LABEL) {
+		IR_ASSERT(tmp_reg != IR_REG_NONE);
+		tmp_reg = IR_REG_NUM(tmp_reg);
+		ir_emit_load_label_addr(ctx, tmp_reg, val_insn);
+		ir_emit_store_mem_int(ctx, type, mem, tmp_reg);
 	} else {
 		int64_t val = val_insn->val.i64;

@@ -3726,7 +3794,8 @@ static ir_mem ir_fuse_load(ir_ctx *ctx, ir_ref root, ir_ref ref)
 	ir_insn *load_insn = &ctx->ir_base[ref];
 	ir_reg reg;

-	IR_ASSERT(load_insn->op == IR_LOAD);
+	IR_ASSERT(load_insn->op == IR_LOAD || load_insn->op == IR_LOAD_v ||
+		load_insn->op == IR_VLOAD || load_insn->op == IR_VLOAD_v);
 	if (UNEXPECTED(ctx->rules[ref] & IR_FUSED_REG)) {
 		reg = ir_get_fused_reg(ctx, root, ref * sizeof(ir_ref) + 2);
 	} else {
@@ -3762,9 +3831,11 @@ static void ir_emit_load_ex(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src, i
 			} else if (insn->op == IR_STR) {
 				ir_backend_data *data = ctx->data;
 				dasm_State **Dst = &data->dasm_state;
-				int label = ir_const_label(ctx, src);
+				int label = ir_get_const_label(ctx, src);

 				|	lea Ra(reg), aword [=>label]
+			} else if (insn->op == IR_LABEL) {
+				ir_emit_load_label_addr(ctx, reg, insn);
 			} else {
 				ir_emit_load_imm_int(ctx, type, reg, insn->val.i64);
 			}
@@ -3862,7 +3933,7 @@ static void ir_emit_prologue(ir_ctx *ctx)
 		if (ctx->flags & IR_USE_FRAME_POINTER) {
 			fp = IR_REG_FRAME_POINTER;

-			offset = -(ctx->stack_frame_size - ctx->stack_frame_alignment - ctx->locals_area_size);
+			offset = -(ctx->stack_frame_size - ctx->locals_area_size);
 		} else {
 			fp = IR_REG_STACK_POINTER;
 			offset = ctx->locals_area_size + ctx->call_stack_size;
@@ -5607,7 +5678,7 @@ static void ir_emit_binop_sse2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 				break;
 		}
 	} else if (IR_IS_CONST_REF(op2)) {
-		int label = ir_const_label(ctx, op2);
+		int label = ir_get_const_label(ctx, op2);

 		switch (insn->op) {
 			default:
@@ -5714,7 +5785,7 @@ static void ir_emit_binop_avx(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 				break;
 		}
 	} else if (IR_IS_CONST_REF(op2)) {
-		int label = ir_const_label(ctx, op2);
+		int label = ir_get_const_label(ctx, op2);

 		switch (insn->op) {
 			default:
@@ -6126,7 +6197,7 @@ static ir_op ir_emit_cmp_fp_common(ir_ctx *ctx, ir_ref root, ir_ref cmp_ref, ir_
 		}
 		|	ASM_FP_REG_REG_OP ucomis, type, op1_reg, op2_reg
 	} else if (IR_IS_CONST_REF(op2)) {
-		int label = ir_const_label(ctx, op2);
+		int label = ir_get_const_label(ctx, op2);

 		|	ASM_FP_REG_TXT_OP ucomis, type, op1_reg, [=>label]
 	} else {
@@ -6975,7 +7046,7 @@ static void ir_emit_return_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
 		} else if ((type == IR_FLOAT && value->val.f == 1.0) || (type == IR_DOUBLE && value->val.d == 1.0)) {
 			|	fld1
 		} else {
-			int label = ir_const_label(ctx, insn->op2);
+			int label = ir_get_const_label(ctx, insn->op2);

 			if (type == IR_DOUBLE) {
 				|	fld qword [=>label]
@@ -7260,7 +7331,20 @@ static void ir_emit_trunc(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
 		}
 		if (op1_reg != def_reg) {
+#ifdef IR_TARGET_X86
+			if (ir_type_size[dst_type] == 1
+			 && (op1_reg == IR_REG_RBP || op1_reg == IR_REG_RSI || op1_reg == IR_REG_RDI)) {
+				ir_backend_data *data = ctx->data;
+				dasm_State **Dst = &data->dasm_state;
+
+				ir_emit_mov(ctx, src_type, def_reg, op1_reg);
+				|	and	Rb(def_reg), 0xff
+			} else {
+				ir_emit_mov(ctx, dst_type, def_reg, op1_reg);
+			}
+#else
 			ir_emit_mov(ctx, dst_type, def_reg, op1_reg);
+#endif
 		}
 	} else {
 		ir_emit_load_ex(ctx, dst_type, def_reg, insn->op1, def);
@@ -7385,7 +7469,7 @@ static void ir_emit_bitcast(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 				}
 			}
 		} else if (IR_IS_CONST_REF(insn->op1)) {
-			int label = ir_const_label(ctx, insn->op1);
+			int label = ir_get_const_label(ctx, insn->op1);

 			|	ASM_FP_REG_TXT_OP movs, dst_type, def_reg, [=>label]
 		} else {
@@ -7417,13 +7501,80 @@ static void ir_emit_int2fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 	IR_ASSERT(IR_IS_TYPE_INT(src_type));
 	IR_ASSERT(IR_IS_TYPE_FP(dst_type));
 	IR_ASSERT(def_reg != IR_REG_NONE);
+
+	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
+		op1_reg = IR_REG_NUM(op1_reg);
+		ir_emit_load(ctx, src_type, op1_reg, insn->op1);
+	}
+
+	if (IR_IS_TYPE_UNSIGNED(src_type) && ir_type_size[src_type] >= sizeof(void*)) {
+		ir_reg tmp_reg = ctx->regs[def][2];
+
+		IR_ASSERT(tmp_reg != IR_REG_NONE);
+		if (op1_reg == IR_REG_NONE) {
+			if (IR_IS_CONST_REF(insn->op1)) {
+				IR_ASSERT(0);
+			} else {
+				ir_mem mem;
+
+				if (ir_rule(ctx, insn->op1) & IR_FUSED) {
+					mem = ir_fuse_load(ctx, def, insn->op1);
+				} else {
+					mem = ir_ref_spill_slot(ctx, insn->op1);
+				}
+				ir_emit_load_mem_int(ctx, src_type, tmp_reg, mem);
+				op1_reg = tmp_reg;
+			}
+		}
+		if (sizeof(void*) == 4) {
+			if (tmp_reg == op1_reg) {
+				| add Rd(op1_reg), 0x80000000
+			} else {
+				| lea Rd(tmp_reg), dword [Rd(op1_reg)+0x80000000]
+				op1_reg = tmp_reg;
+			}
+		} else {
+|.if X64
+			|	test Rq(op1_reg), Rq(op1_reg)
+			|	js >1
+			|.cold_code
+			|1:
+			if (tmp_reg != op1_reg) {
+				| mov Rq(tmp_reg), Rq(op1_reg)
+			}
+			|	shr	Rq(tmp_reg), 1
+			|	adc Rq(tmp_reg), 0
+			if (dst_type == IR_DOUBLE) {
+				if (ctx->mflags & IR_X86_AVX) {
+					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
+					|	vcvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rq(tmp_reg)
+					|	vaddsd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
+				} else {
+					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
+					|	cvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), Rq(tmp_reg)
+					|	addsd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
+				}
+			} else {
+				IR_ASSERT(dst_type == IR_FLOAT);
+				if (ctx->mflags & IR_X86_AVX) {
+					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
+					|	vcvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rq(tmp_reg)
+					|	vaddss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
+				} else {
+					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
+					|	cvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), Rq(tmp_reg)
+					|	addss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
+				}
+			}
+			|	jmp >2
+			|.code
+|.endif
+		}
+	}
+
 	if (op1_reg != IR_REG_NONE) {
 		bool src64 = 0;

-		if (IR_REG_SPILLED(op1_reg)) {
-			op1_reg = IR_REG_NUM(op1_reg);
-			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
-		}
 		if (IR_IS_TYPE_SIGNED(src_type)) {
 			if (ir_type_size[src_type] < 4) {
 |.if X64
@@ -7462,7 +7613,6 @@ static void ir_emit_int2fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 ||				}
 |.endif
 			} else {
-				// TODO: uint64_t -> double
 				src64 = 1;
 			}
 		}
@@ -7508,6 +7658,40 @@ static void ir_emit_int2fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 			}
 |.endif
 		}
+		|2:
+		if (sizeof(void*) == 4 && IR_IS_TYPE_UNSIGNED(src_type) && ir_type_size[src_type] >= sizeof(void*)) {
+			if (dst_type == IR_DOUBLE) {
+				uint32_t c = (sizeof(void*) == 4) ? 0x41e00000 : 0x43e00000;
+				if (!data->u2d_const) {
+					data->u2d_const = 1;
+					ir_rodata(ctx);
+					|.align 8
+					|->u2d_const:
+					|.dword 0, c
+					|.code
+				}
+				if (ctx->mflags & IR_X86_AVX) {
+					|	vaddsd	xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword [->u2d_const]
+				} else {
+					|	addsd	xmm(def_reg-IR_REG_FP_FIRST), qword [->u2d_const]
+				}
+			} else {
+				uint32_t c = (sizeof(void*) == 4) ? 0x4f000000 : 0x5f000000;
+				if (!data->u2f_const) {
+					data->u2f_const = 1;
+					ir_rodata(ctx);
+					|.align 4
+					|->u2f_const:
+					|.dword c
+					|.code
+				}
+				if (ctx->mflags & IR_X86_AVX) {
+					|	vaddss	xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword [->u2f_const]
+				} else {
+					|	addss	xmm(def_reg-IR_REG_FP_FIRST), dword [->u2f_const]
+				}
+			}
+		}
 	} else if (IR_IS_CONST_REF(insn->op1)) {
 		IR_ASSERT(0);
 	} else {
@@ -7625,7 +7809,7 @@ static void ir_emit_fp2int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 |.endif
 		}
 	} else if (IR_IS_CONST_REF(insn->op1)) {
-		int label = ir_const_label(ctx, insn->op1);
+		int label = ir_get_const_label(ctx, insn->op1);

 		if (!dst64) {
 			if (src_type == IR_DOUBLE) {
@@ -7746,7 +7930,7 @@ static void ir_emit_fp2fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 			}
 		}
 	} else if (IR_IS_CONST_REF(insn->op1)) {
-		int label = ir_const_label(ctx, insn->op1);
+		int label = ir_get_const_label(ctx, insn->op1);

 		if (src_type == IR_DOUBLE) {
 			if (ctx->mflags & IR_X86_AVX) {
@@ -8429,7 +8613,7 @@ static void ir_emit_va_start(ir_ctx *ctx, ir_ref def, ir_insn *insn)

 	if (ctx->flags & IR_USE_FRAME_POINTER) {
 		fp = IR_REG_FRAME_POINTER;
-		reg_save_area_offset = -(ctx->stack_frame_size - ctx->stack_frame_alignment - ctx->locals_area_size);
+		reg_save_area_offset = -(ctx->stack_frame_size - ctx->locals_area_size);
 		overflow_arg_area_offset = sizeof(void*) * 2 + ctx->param_stack_size;
 	} else {
 		fp = IR_REG_STACK_POINTER;
@@ -8588,11 +8772,11 @@ static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 		}
 		|	add Ra(tmp_reg), IR_MAX(ir_type_size[type], sizeof(void*))
 	} else {
-		int size = (uint32_t)insn->op3 >> 3;
+		int size = IR_VA_ARG_SIZE(insn->op3);

 		if (def_reg != IR_REG_NONE) {
 			IR_ASSERT(type == IR_ADDR);
-			int align = 1U << (insn->op3 & 0x7);
+			int align = IR_VA_ARG_ALIGN(insn->op3);

 			if (align > (int)sizeof(void*)) {
 				|	add Ra(tmp_reg), (align-1)
@@ -8604,7 +8788,7 @@ static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 	}
 #endif
 	|	mov aword [Ra(op2_reg)+offset], Ra(tmp_reg)
-	if (def_reg && IR_REG_SPILLED(ctx->regs[def][0])) {
+	if (def_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[def][0])) {
 		ir_emit_store(ctx, type, def, def_reg);
 	}
 #elif defined(IR_TARGET_X64)
@@ -8632,8 +8816,8 @@ static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 	if (insn->op3) {
 		/* long struct argument */
 		IR_ASSERT(type == IR_ADDR);
-		int align = 1U << (insn->op3 & 0x7);
-		int size = (uint32_t)insn->op3 >> 3;
+		int align = IR_VA_ARG_ALIGN(insn->op3);
+		int size = IR_VA_ARG_SIZE(insn->op3);

 		|	mov Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))]
 		if (align > (int)sizeof(void*)) {
@@ -9701,6 +9885,19 @@ static void ir_emit_ijmp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
 	ir_reg op2_reg = ctx->regs[def][2];

 	if (IR_IS_CONST_REF(insn->op2)) {
+		if (ctx->ir_base[insn->op2].op == IR_LABEL) {
+			if (!data->resolved_label_syms) {
+				data->resolved_label_syms = 1;
+				ir_resolve_label_syms(ctx);
+			}
+
+			uint32_t target = ctx->ir_base[insn->op2].val.u32_hi;
+			target = ir_skip_empty_target_blocks(ctx, target);
+
+			|	jmp =>target
+			return;
+		}
+
 		void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op2]);

 		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
@@ -10478,6 +10675,7 @@ static void ir_emit_param_move(ir_ctx *ctx, uint8_t type, ir_reg from_reg, ir_re
 {
 	ir_reg fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;

+	offset = IR_SPILL_POS_TO_OFFSET(offset);
 	IR_ASSERT(from_reg != IR_REG_NONE || to_reg != IR_REG_NONE);

 	if (IR_IS_TYPE_INT(type)) {
@@ -10518,6 +10716,7 @@ static void ir_emit_load_params(ir_ctx *ctx)
 	const int8_t *int_reg_params = _ir_int_reg_params;
 	const int8_t *fp_reg_params = _ir_fp_reg_params;
 	int32_t stack_offset = 0;
+	int32_t stack_start = 0;

 #ifdef IR_TARGET_X86
 	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC)) {
@@ -10529,9 +10728,11 @@ static void ir_emit_load_params(ir_ctx *ctx)
 #endif

 	if (ctx->flags & IR_USE_FRAME_POINTER) {
-		stack_offset = sizeof(void*) * 2; /* skip old frame pointer and return address */
+		/* skip old frame pointer and return address */
+		stack_start = sizeof(void*) * 2 + ctx->stack_frame_size;
 	} else {
-		stack_offset = sizeof(void*) + ctx->stack_frame_size + ctx->call_stack_size; /* skip return address */
+		 /* skip return address */
+		stack_start = sizeof(void*) + ctx->stack_frame_size;
 	}
 	n = use_list->count;
 	for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
@@ -10573,12 +10774,9 @@ static void ir_emit_load_params(ir_ctx *ctx)
 			if (ctx->vregs[use]) {
 				dst_reg = IR_REG_NUM(ctx->regs[use][0]);
 				IR_ASSERT(src_reg != IR_REG_NONE || dst_reg != IR_REG_NONE ||
-					stack_offset == ctx->live_intervals[ctx->vregs[use]]->stack_spill_pos +
-						((ctx->flags & IR_USE_FRAME_POINTER) ?
-							-(ctx->stack_frame_size - ctx->stack_frame_alignment) :
-							ctx->call_stack_size));
+					stack_start + stack_offset == ctx->live_intervals[ctx->vregs[use]]->stack_spill_pos);
 				if (src_reg != dst_reg) {
-					ir_emit_param_move(ctx, insn->type, src_reg, dst_reg, use, stack_offset);
+					ir_emit_param_move(ctx, insn->type, src_reg, dst_reg, use, stack_start + stack_offset);
 				}
 				if (dst_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[use][0])) {
 					ir_emit_store(ctx, insn->type, use, dst_reg);
@@ -10665,7 +10863,7 @@ static void ir_fix_param_spills(ir_ctx *ctx)

 	if (ctx->flags & IR_USE_FRAME_POINTER) {
 		/* skip old frame pointer and return address */
-		stack_start = sizeof(void*) * 2 + (ctx->stack_frame_size - ctx->stack_frame_alignment);
+		stack_start = sizeof(void*) * 2 + ctx->stack_frame_size;
 	} else {
 		 /* skip return address */
 		stack_start = sizeof(void*) + ctx->stack_frame_size;
@@ -10786,6 +10984,7 @@ static void ir_allocate_unique_spill_slots(ir_ctx *ctx)
 				case IR_MERGE:
 				case IR_LOOP_BEGIN:
 				case IR_LOOP_END:
+				case IR_IGOTO_DUP:
 					break;
 #ifndef IR_REG_FP_RET1
 				case IR_CALL:
@@ -10810,7 +11009,7 @@ static void ir_allocate_unique_spill_slots(ir_ctx *ctx)
 							IR_REGSET_EXCL(available, reg);
 							ctx->regs[i][0] = reg | IR_REG_SPILL_STORE;
 						} else if (def_flags & IR_USE_MUST_BE_IN_REG) {
-							if (insn->op == IR_VLOAD
+							if ((insn->op == IR_VLOAD || insn->op == IR_VLOAD_v)
 							 && ctx->live_intervals[ctx->vregs[i]]
 							 && ctx->live_intervals[ctx->vregs[i]]->stack_spill_pos != -1
 							 && ir_is_same_mem_var(ctx, i, ctx->ir_base[insn->op2].op3)) {
@@ -10850,7 +11049,7 @@ static void ir_allocate_unique_spill_slots(ir_ctx *ctx)
 							for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
 								use = *p;
 								use_insn = &ctx->ir_base[use];
-								if (use_insn->op == IR_VLOAD) {
+								if (use_insn->op == IR_VLOAD || use_insn->op == IR_VLOAD_v) {
 									if (ctx->vregs[use]
 									 && !ctx->live_intervals[ctx->vregs[use]]) {
 										ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
@@ -10861,7 +11060,7 @@ static void ir_allocate_unique_spill_slots(ir_ctx *ctx)
 										ival->vreg = ctx->vregs[use];
 										ival->stack_spill_pos = stack_spill_pos;
 									}
-								} else if (use_insn->op == IR_VSTORE) {
+								} else if (use_insn->op == IR_VSTORE || use_insn->op == IR_VSTORE_v) {
 									if (!IR_IS_CONST_REF(use_insn->op3)
 									 && ctx->vregs[use_insn->op3]
 									 && !ctx->live_intervals[ctx->vregs[use_insn->op3]]) {
@@ -11006,7 +11205,6 @@ void ir_fix_stack_frame(ir_ctx *ctx)

 	ctx->stack_frame_size = IR_ALIGNED_SIZE(ctx->stack_frame_size, sizeof(void*));
 	ctx->stack_frame_size += additional_size;
-	ctx->stack_frame_alignment = 0;
 	ctx->call_stack_size = 0;

 	if (ctx->flags2 & IR_16B_FRAME_ALIGNMENT) {
@@ -11014,12 +11212,10 @@ void ir_fix_stack_frame(ir_ctx *ctx)
 		if (!(ctx->flags & IR_FUNCTION)) {
 			while (IR_ALIGNED_SIZE(ctx->stack_frame_size, 16) != ctx->stack_frame_size) {
 				ctx->stack_frame_size += sizeof(void*);
-				ctx->stack_frame_alignment += sizeof(void*);
 			}
 		} else if (ctx->flags & IR_USE_FRAME_POINTER) {
 			while (IR_ALIGNED_SIZE(ctx->stack_frame_size + sizeof(void*) * 2, 16) != ctx->stack_frame_size + sizeof(void*) * 2) {
 				ctx->stack_frame_size += sizeof(void*);
-				ctx->stack_frame_alignment += sizeof(void*);
 			}
 		} else {
 			if (!(ctx->flags & IR_NO_STACK_COMBINE)) {
@@ -11028,7 +11224,6 @@ void ir_fix_stack_frame(ir_ctx *ctx)
 			while (IR_ALIGNED_SIZE(ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*), 16) !=
 					ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*)) {
 				ctx->stack_frame_size += sizeof(void*);
-				ctx->stack_frame_alignment += sizeof(void*);
 			}
 		}
 	}
@@ -11061,6 +11256,8 @@ void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
 	int ret;
 	void *entry;
 	size_t size;
+	ir_ref igoto_dup_ref = IR_UNUSED;
+	uint32_t igoto_dup_block = 0;

 	data.ra_data.unused_slot_4 = 0;
 	data.ra_data.unused_slot_2 = 0;
@@ -11073,11 +11270,13 @@ void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
 	data.double_abs_const = 0;
 	data.float_abs_const = 0;
 	data.double_zero_const = 0;
+	data.u2d_const = 0;
+	data.u2f_const = 0;
+	data.resolved_label_syms = 0;
 	ctx->data = &data;

 	if (!ctx->live_intervals) {
 		ctx->stack_frame_size = 0;
-		ctx->stack_frame_alignment = 0;
 		ctx->call_stack_size = 0;
 		ctx->used_preserved_regs = 0;
 		ir_allocate_unique_spill_slots(ctx);
@@ -11099,7 +11298,6 @@ void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
 		}
 		ctx->stack_frame_size = ctx->fixed_stack_frame_size;
 		ctx->call_stack_size = ctx->fixed_call_stack_size;
-		ctx->stack_frame_alignment = 0;
 	}

 	Dst = &data.dasm_state;
@@ -11420,6 +11618,35 @@ void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
 				case IR_TAILCALL:
 					ir_emit_tailcall(ctx, i, insn);
 					break;
+				case IR_IGOTO_DUP:
+					if (bb->flags & IR_BB_DESSA_MOVES) {
+						ir_emit_dessa_moves(ctx, b, bb);
+					}
+					IR_ASSERT(!igoto_dup_ref && !igoto_dup_block);
+					igoto_dup_ref = i;
+					igoto_dup_block = b;
+					b = ctx->cfg_edges[bb->successors];
+					bb = &ctx->cfg_blocks[b];
+					i = bb->start;
+					insn = &ctx->ir_base[i];
+					rule = &ctx->rules[i];
+					break;
+				case IR_IGOTO:
+					if ((ctx->ir_base[insn->op1].op == IR_MERGE || ctx->ir_base[insn->op1].op == IR_LOOP_BEGIN)
+					 && (ctx->rules[ctx->ir_base[insn->op1].op1] & IR_RULE_MASK) == IR_IGOTO_DUP
+					 && igoto_dup_ref) {
+						ir_emit_ijmp(ctx, i, insn);
+						b = igoto_dup_block;
+						bb = &ctx->cfg_blocks[b];
+						i = igoto_dup_ref;
+						insn = &ctx->ir_base[i];
+						rule = &ctx->rules[i];
+						igoto_dup_block = 0;
+						igoto_dup_ref = 0;
+						break;
+					}
+					IR_ASSERT(!igoto_dup_ref && !igoto_dup_block);
+					IR_FALLTHROUGH;
 				case IR_IJMP:
 					ir_emit_ijmp(ctx, i, insn);
 					break;
@@ -11449,6 +11676,7 @@ void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
 					ir_emit_vaddr(ctx, i, insn);
 					break;
 				case IR_VLOAD:
+				case IR_VLOAD_v:
 					ir_emit_vload(ctx, i, insn);
 					break;
 				case IR_VSTORE_INT:
@@ -11691,6 +11919,28 @@ next_block:;
 		} while (i != 0);
 	}

+	if ((ctx->flags2 & IR_HAS_BLOCK_ADDR) && ctx->loader && ctx->loader->add_label) {
+		for (b = 1, bb = &ctx->cfg_blocks[b]; b <= ctx->cfg_blocks_count; bb++, b++) {
+			ir_insn *insn = &ctx->ir_base[bb->start];
+
+			if (insn->op == IR_BEGIN && insn->op2) {
+				IR_ASSERT(ctx->ir_base[insn->op2].op == IR_LABEL);
+				ctx->ir_base[insn->op2].val.u32_hi = 0;
+				ctx->loader->add_label(ctx->loader, ir_get_str(ctx, ctx->ir_base[insn->op2].val.str),
+					(char*)entry + dasm_getpclabel(&data.dasm_state, ir_skip_empty_target_blocks(ctx, b)));
+			}
+		}
+	} else if (data.resolved_label_syms) {
+		for (b = 1, bb = &ctx->cfg_blocks[b]; b <= ctx->cfg_blocks_count; bb++, b++) {
+			ir_insn *insn = &ctx->ir_base[bb->start];
+
+			if (insn->op == IR_BEGIN && insn->op2) {
+				IR_ASSERT(ctx->ir_base[insn->op2].op == IR_LABEL);
+				ctx->ir_base[insn->op2].val.u32_hi = 0;
+			}
+		}
+	}
+
 	dasm_free(&data.dasm_state);

 	ir_mem_flush(entry, size);
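
A closing note on the largest addition, the unsigned int-to-FP path in ir_emit_int2fp(): x86 (pre-AVX-512) has no unsigned convert instruction. On 32-bit builds the patch biases the u32 by 2^31 (the add/lea of 0x80000000), converts it as signed, then adds 2^31 back from rodata; 0x41e00000 is the high dword of the double 2^31.0 and 0x4f000000 is 2^31.0f. On 64-bit builds, values with the sign bit set are halved with the shr/adc pair, converted as signed, and doubled with a final FP add. C sketches of both strategies (illustrative; the 64-bit version uses the common round-to-odd halving, which plays the role of the patch's shr/adc pair):

	double u32_to_double(uint32_t u)          /* 32-bit strategy */
	{
		int32_t biased = (int32_t)(u + 0x80000000u);  /* u - 2^31 */
		return (double)biased + 2147483648.0;         /* ... + 2^31 */
	}

	double u64_to_double(uint64_t u)          /* 64-bit strategy */
	{
		if ((int64_t)u >= 0) {
			return (double)(int64_t)u;   /* fits a signed convert */
		}
		uint64_t h = (u >> 1) | (u & 1); /* halve, rounding to odd */
		double d = (double)(int64_t)h;
		return d + d;                    /* the trailing addsd xmm, xmm */
	}
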