x86/alternatives: Make JMPs more robust

commit 48c7a2509f9e237d8465399d9cdfe487d3212a23 upstream. Up until now we had to pay attention to relative JMPs in alternatives about how their relative offset gets computed so that the jump target is still correct. Or, as it is the case for near CALLs (opcode e8), we still have to go and readjust the offset at patching time. What is more, the static_cpu_has_safe() facility had to forcefully generate 5-byte JMPs since we couldn't rely on the compiler to generate properly sized ones so we had to force the longest ones. Worse than that, sometimes it would generate a replacement JMP which is longer than the original one, thus overwriting the beginning of the next instruction at patching time. So, in order to alleviate all that and make using JMPs more straight-forward we go and pad the original instruction in an alternative block with NOPs at build time, should the replacement(s) be longer. This way, alternatives users shouldn't pay special attention so that original and replacement instruction sizes are fine but the assembler would simply add padding where needed and not do anything otherwise. As a second aspect, we go and recompute JMPs at patching time so that we can try to make 5-byte JMPs into two-byte ones if possible. If not, we still have to recompute the offsets as the replacement JMP gets put far away in the .altinstr_replacement section leading to a wrong offset if copied verbatim. For example, on a locally generated kernel image old insn VA: 0xffffffff810014bd, CPU feat: X86_FEATURE_ALWAYS, size: 2 __switch_to: ffffffff810014bd: eb 21 jmp ffffffff810014e0 repl insn: size: 5 ffffffff81d0b23c: e9 b1 62 2f ff jmpq ffffffff810014f2 gets corrected to a 2-byte JMP: apply_alternatives: feat: 3*32+21, old: (ffffffff810014bd, len: 2), repl: (ffffffff81d0b23c, len: 5) alt_insn: e9 b1 62 2f ff recompute_jumps: next_rip: ffffffff81d0b241, tgt_rip: ffffffff810014f2, new_displ: 0x00000033, ret len: 2 converted to: eb 33 90 90 90 and a 5-byte JMP: old insn VA: 0xffffffff81001516, CPU feat: X86_FEATURE_ALWAYS, size: 2 __switch_to: ffffffff81001516: eb 30 jmp ffffffff81001548 repl insn: size: 5 ffffffff81d0b241: e9 10 63 2f ff jmpq ffffffff81001556 gets shortened into a two-byte one: apply_alternatives: feat: 3*32+21, old: (ffffffff81001516, len: 2), repl: (ffffffff81d0b241, len: 5) alt_insn: e9 10 63 2f ff recompute_jumps: next_rip: ffffffff81d0b246, tgt_rip: ffffffff81001556, new_displ: 0x0000003e, ret len: 2 converted to: eb 3e 90 90 90 ... and so on. This leads to a net win of around 40ish replacements * 3 bytes savings =~ 120 bytes of I$ on an AMD guest which means some savings of precious instruction cache bandwidth. The padding to the shorter 2-byte JMPs are single-byte NOPs which on smart microarchitectures means discarding NOPs at decode time and thus freeing up execution bandwidth. Signed-off-by: Borislav Petkov <bp@suse.de> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
author: Borislav Petkov <bp@suse.de> 2015-01-05 13:48:41 +0100
committer: Ben Hutchings <ben@decadent.org.uk> 2018-01-09 00:35:15 +0000
commit: 8f6482cd4c3594ccc2394f69a4bfe08c1d6f26e9 (patch)
tree: ef75ee538c1469eed100dd5386e34fde209c665c
parent: 005db1f8ff3e424440cf4746a1af6cc3dbdf699a (diff)
3 files changed, 105 insertions, 19 deletions
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index a932a1fbe66c..24aa32127f72 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -458,13 +458,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
 static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 {
 #ifdef CC_HAVE_ASM_GOTO
-/*
- * We need to spell the jumps to the compiler because, depending on the offset,
- * the replacement jump can be bigger than the original jump, and this we cannot
- * have. Thus, we force the jump to the widest, 4-byte, signed relative
- * offset even though the last would often fit in less bytes.
- */
-		asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
+		asm_volatile_goto("1: jmp %l[t_dynamic]\n"
 			 "2:\n"
 			 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
 			         "((5f-4f) - (2b-1b)),0x90\n"
@@ -478,7 +472,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 			 " .byte 3b - 2b\n"		/* pad len */
 			 ".previous\n"
 			 ".section .altinstr_replacement,\"ax\"\n"
-			 "4: .byte 0xe9\n .long %l[t_no] - 2b\n"
+			 "4: jmp %l[t_no]\n"
 			 "5:\n"
 			 ".previous\n"
 			 ".section .altinstructions,\"a\"\n"
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c99b0f13a90e..715af37bf008 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -58,6 +58,21 @@ do {									\
 		printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);	\
 } while (0)
 
+#define DUMP_BYTES(buf, len, fmt, args...)				\
+do {									\
+	if (unlikely(debug_alternative)) {				\
+		int j;							\
+									\
+		if (!(len))						\
+			break;						\
+									\
+		printk(KERN_DEBUG fmt, ##args);				\
+		for (j = 0; j < (len) - 1; j++)				\
+			printk(KERN_CONT "%02hhx ", buf[j]);		\
+		printk(KERN_CONT "%02hhx\n", buf[j]);			\
+	}								\
+} while (0)
+
 /*
  * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
  * that correspond to that nop. Getting from one nop to the next, we
@@ -244,6 +259,71 @@ extern s32 __smp_locks[], __smp_locks_end[];
 void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /*
+ * Are we looking at a near JMP with a 1 or 4-byte displacement.
+ */
+static inline bool is_jmp(const u8 opcode)
+{
+	return opcode == 0xeb || opcode == 0xe9;
+}
+
+static void __init_or_module
+recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+{
+	u8 *next_rip, *tgt_rip;
+	s32 n_dspl, o_dspl;
+	int repl_len;
+
+	if (a->replacementlen != 5)
+		return;
+
+	o_dspl = *(s32 *)(insnbuf + 1);
+
+	/* next_rip of the replacement JMP */
+	next_rip = repl_insn + a->replacementlen;
+	/* target rip of the replacement JMP */
+	tgt_rip  = next_rip + o_dspl;
+	n_dspl = tgt_rip - orig_insn;
+
+	DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
+
+	if (tgt_rip - orig_insn >= 0) {
+		if (n_dspl - 2 <= 127)
+			goto two_byte_jmp;
+		else
+			goto five_byte_jmp;
+	/* negative offset */
+	} else {
+		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
+			goto two_byte_jmp;
+		else
+			goto five_byte_jmp;
+	}
+
+two_byte_jmp:
+	n_dspl -= 2;
+
+	insnbuf[0] = 0xeb;
+	insnbuf[1] = (s8)n_dspl;
+	add_nops(insnbuf + 2, 3);
+
+	repl_len = 2;
+	goto done;
+
+five_byte_jmp:
+	n_dspl -= 5;
+
+	insnbuf[0] = 0xe9;
+	*(s32 *)&insnbuf[1] = n_dspl;
+
+	repl_len = 5;
+
+done:
+
+	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+}
+
+/*
  * Replace instructions with better alternatives for this CPU type. This runs
  * before SMP is initialized to avoid SMP problems with self modifying code.
  * This implies that asymmetric systems where APs have less capabilities than
@@ -268,6 +348,8 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 	 * order.
 	 */
 	for (a = start; a < end; a++) {
+		int insnbuf_sz = 0;
+
 		instr = (u8 *)&a->instr_offset + a->instr_offset;
 		replacement = (u8 *)&a->repl_offset + a->repl_offset;
 		BUG_ON(a->instrlen > sizeof(insnbuf));
@@ -281,24 +363,35 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 			instr, a->instrlen,
 			replacement, a->replacementlen);
 
+		DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
+		DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
+
 		memcpy(insnbuf, replacement, a->replacementlen);
+		insnbuf_sz = a->replacementlen;
 
 		/* 0xe8 is a relative jump; fix the offset. */
 		if (*insnbuf == 0xe8 && a->replacementlen == 5) {
 			*(s32 *)(insnbuf + 1) += replacement - instr;
-			DPRINTK("Fix CALL offset: 0x%x", *(s32 *)(insnbuf + 1));
+			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+				*(s32 *)(insnbuf + 1),
+				(unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
 		}
 
-		if (a->instrlen > a->replacementlen)
+		if (a->replacementlen && is_jmp(replacement[0]))
+			recompute_jump(a, instr, replacement, insnbuf);
+
+		if (a->instrlen > a->replacementlen) {
 			add_nops(insnbuf + a->replacementlen,
 				 a->instrlen - a->replacementlen);
+			insnbuf_sz += a->instrlen - a->replacementlen;
+		}
+		DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
 
-		text_poke_early(instr, insnbuf, a->instrlen);
+		text_poke_early(instr, insnbuf, insnbuf_sz);
 	}
 }
 
 #ifdef CONFIG_SMP
-
 static void alternatives_smp_lock(const s32 *start, const s32 *end,
 				  u8 *text, u8 *text_end)
 {
@@ -449,7 +542,7 @@ int alternatives_text_reserved(void *start, void *end)
 
 	return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_PARAVIRT
 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 5027613d9660..fca1f3a32bf5 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -28,14 +28,13 @@
  */
 	.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
 0:
-	.byte 0xe9	/* 32bit jump */
-	.long \orig-1f	/* by default jump to orig */
+	jmp \orig
 1:
 	.section .altinstr_replacement,"ax"
-2:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt1-1b /* offset */   /* or alternatively to alt1 */
-3:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt2-1b /* offset */   /* or alternatively to alt2 */
+2:
+	jmp \alt1
+3:
+	jmp \alt2
 	.previous
 
 	.section .altinstructions,"a"
author	Borislav Petkov <bp@suse.de>	2015-01-05 13:48:41 +0100
committer	Ben Hutchings <ben@decadent.org.uk>	2018-01-09 00:35:15 +0000
commit	8f6482cd4c3594ccc2394f69a4bfe08c1d6f26e9 (patch)
tree	ef75ee538c1469eed100dd5386e34fde209c665c
parent	005db1f8ff3e424440cf4746a1af6cc3dbdf699a (diff)