50 files changed, 1841 insertions, 426 deletions
diff --git a/Documentation/admin-guide/perf/alibaba_pmu.rst b/Documentation/admin-guide/perf/alibaba_pmu.rst
new file mode 100644
index 000000000000..11de998bb480
--- /dev/null
+++ b/Documentation/admin-guide/perf/alibaba_pmu.rst
@@ -0,0 +1,100 @@
+=============================================================
+Alibaba's T-Head SoC Uncore Performance Monitoring Unit (PMU)
+=============================================================
+
+The Yitian 710, custom-built by Alibaba Group's chip development business,
+T-Head, implements uncore PMU for performance and functional debugging to
+facilitate system maintenance.
+
+DDR Sub-System Driveway (DRW) PMU Driver
+=========================================
+
+Yitian 710 employs eight DDR5/4 channels, four on each die. Each DDR5 channel
+is independent of others to service system memory requests. And one DDR5
+channel is split into two independent sub-channels. The DDR Sub-System Driveway
+implements separate PMUs for each sub-channel to monitor various performance
+metrics.
+
+The Driveway PMU devices are named as ali_drw_<sys_base_addr> with perf.
+For example, ali_drw_21000 and ali_drw_21080 are two PMU devices for two
+sub-channels of the same channel in die 0. And the PMU device of die 1 is
+prefixed with ali_drw_400XXXXX, e.g. ali_drw_40021000.
+
+Each sub-channel has 36 PMU counters in total, which is classified into
+four groups:
+
+- Group 0: PMU Cycle Counter. This group has one pair of counters
+  pmu_cycle_cnt_low and pmu_cycle_cnt_high, that is used as the cycle count
+  based on DDRC core clock.
+
+- Group 1: PMU Bandwidth Counters. This group has 8 counters that are used
+  to count the total access number of either the eight bank groups in a
+  selected rank, or four ranks separately in the first 4 counters. The base
+  transfer unit is 64B.
+
+- Group 2: PMU Retry Counters. This group has 10 counters, that intend to
+  count the total retry number of each type of uncorrectable error.
+
+- Group 3: PMU Common Counters. This group has 16 counters, that are used
+  to count the common events.
+
+For now, the Driveway PMU driver only uses counters in group 0 and group 3.
+
+The DDR Controller (DDRCTL) and DDR PHY combine to create a complete solution
+for connecting an SoC application bus to DDR memory devices. The DDRCTL
+receives transactions Host Interface (HIF) which is custom-defined by Synopsys.
+These transactions are queued internally and scheduled for access while
+satisfying the SDRAM protocol timing requirements, transaction priorities, and
+dependencies between the transactions. The DDRCTL in turn issues commands on
+the DDR PHY Interface (DFI) to the PHY module, which launches and captures data
+to and from the SDRAM. The driveway PMUs have hardware logic to gather
+statistics and performance logging signals on HIF, DFI, etc.
+
+By counting the READ, WRITE and RMW commands sent to the DDRC through the HIF
+interface, we could calculate the bandwidth. Example usage of counting memory
+data bandwidth::
+
+  perf stat \
+    -e ali_drw_21000/hif_wr/ \
+    -e ali_drw_21000/hif_rd/ \
+    -e ali_drw_21000/hif_rmw/ \
+    -e ali_drw_21000/cycle/ \
+    -e ali_drw_21080/hif_wr/ \
+    -e ali_drw_21080/hif_rd/ \
+    -e ali_drw_21080/hif_rmw/ \
+    -e ali_drw_21080/cycle/ \
+    -e ali_drw_23000/hif_wr/ \
+    -e ali_drw_23000/hif_rd/ \
+    -e ali_drw_23000/hif_rmw/ \
+    -e ali_drw_23000/cycle/ \
+    -e ali_drw_23080/hif_wr/ \
+    -e ali_drw_23080/hif_rd/ \
+    -e ali_drw_23080/hif_rmw/ \
+    -e ali_drw_23080/cycle/ \
+    -e ali_drw_25000/hif_wr/ \
+    -e ali_drw_25000/hif_rd/ \
+    -e ali_drw_25000/hif_rmw/ \
+    -e ali_drw_25000/cycle/ \
+    -e ali_drw_25080/hif_wr/ \
+    -e ali_drw_25080/hif_rd/ \
+    -e ali_drw_25080/hif_rmw/ \
+    -e ali_drw_25080/cycle/ \
+    -e ali_drw_27000/hif_wr/ \
+    -e ali_drw_27000/hif_rd/ \
+    -e ali_drw_27000/hif_rmw/ \
+    -e ali_drw_27000/cycle/ \
+    -e ali_drw_27080/hif_wr/ \
+    -e ali_drw_27080/hif_rd/ \
+    -e ali_drw_27080/hif_rmw/ \
+    -e ali_drw_27080/cycle/ -- sleep 10
+
+The average DRAM bandwidth can be calculated as follows:
+
+- Read Bandwidth =  perf_hif_rd * DDRC_WIDTH * DDRC_Freq / DDRC_Cycle
+- Write Bandwidth = (perf_hif_wr + perf_hif_rmw) * DDRC_WIDTH * DDRC_Freq / DDRC_Cycle
+
+Here, DDRC_WIDTH = 64 bytes.
+
+The current driver does not support sampling. So "perf record" is
+unsupported.  Also attach to a task is unsupported as the events are all
+uncore.
diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst
index 9c9ece88ce53..793e1970bc05 100644
--- a/Documentation/admin-guide/perf/index.rst
+++ b/Documentation/admin-guide/perf/index.rst
@@ -18,3 +18,4 @@ Performance monitor support
    xgene-pmu
    arm_dsu_pmu
    thunderx2-pmu
+   alibaba_pmu
diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
index 311021f2e560..bb34287c8e01 100644
--- a/Documentation/arm64/elf_hwcaps.rst
+++ b/Documentation/arm64/elf_hwcaps.rst
@@ -272,6 +272,9 @@ HWCAP2_WFXT
 HWCAP2_EBF16
     Functionality implied by ID_AA64ISAR1_EL1.BF16 == 0b0010.
 
+HWCAP2_SVE_EBF16
+    Functionality implied by ID_AA64ZFR0_EL1.BF16 == 0b0010.
+
 4. Unused AT_HWCAP bits
 -----------------------
 
diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst
index fda97b3fcf01..17d9fc5d14fb 100644
--- a/Documentation/arm64/silicon-errata.rst
+++ b/Documentation/arm64/silicon-errata.rst
@@ -110,6 +110,8 @@ stable kernels.
 +----------------+-----------------+-----------------+-----------------------------+
 | ARM            | Cortex-A510     | #2441009        | ARM64_ERRATUM_2441009       |
 +----------------+-----------------+-----------------+-----------------------------+
+| ARM            | Cortex-A510     | #2658417        | ARM64_ERRATUM_2658417       |
++----------------+-----------------+-----------------+-----------------------------+
 | ARM            | Cortex-A710     | #2119858        | ARM64_ERRATUM_2119858       |
 +----------------+-----------------+-----------------+-----------------------------+
 | ARM            | Cortex-A710     | #2054223        | ARM64_ERRATUM_2054223       |
diff --git a/Documentation/arm64/sme.rst b/Documentation/arm64/sme.rst
index 937147f58cc5..16d2db4c2e2e 100644
--- a/Documentation/arm64/sme.rst
+++ b/Documentation/arm64/sme.rst
@@ -331,6 +331,9 @@ The regset data starts with struct user_za_header, containing:
   been read if a PTRACE_GETREGSET of NT_ARM_ZA were executed for each thread
   when the coredump was generated.
 
+* The NT_ARM_TLS note will be extended to two registers, the second register
+  will contain TPIDR2_EL0 on systems that support SME and will be read as
+  zero with writes ignored otherwise.
 
 9.  System runtime configuration
 --------------------------------
diff --git a/Documentation/arm64/sve.rst b/Documentation/arm64/sve.rst
index 93c2c2990584..f338ee2df46d 100644
--- a/Documentation/arm64/sve.rst
+++ b/Documentation/arm64/sve.rst
@@ -111,7 +111,7 @@ the SVE instruction set architecture.
 
 * On syscall, V0..V31 are preserved (as without SVE).  Thus, bits [127:0] of
   Z0..Z31 are preserved.  All other bits of Z0..Z31, and all of P0..P15 and FFR
-  become unspecified on return from a syscall.
+  become zero on return from a syscall.
 
 * The SVE registers are not used to pass arguments to or receive results from
   any syscall.
@@ -452,6 +452,24 @@ The regset data starts with struct user_sve_header, containing:
 * Modifying the system default vector length does not affect the vector length
   of any existing process or thread that does not make an execve() call.
 
+10.  Perf extensions
+--------------------------------
+
+* The arm64 specific DWARF standard [5] added the VG (Vector Granule) register
+  at index 46. This register is used for DWARF unwinding when variable length
+  SVE registers are pushed onto the stack.
+
+* Its value is equivalent to the current SVE vector length (VL) in bits divided
+  by 64.
+
+* The value is included in Perf samples in the regs[46] field if
+  PERF_SAMPLE_REGS_USER is set and the sample_regs_user mask has bit 46 set.
+
+* The value is the current value at the time the sample was taken, and it can
+  change over time.
+
+* If the system doesn't support SVE when perf_event_open is called with these
+  settings, the event will fail to open.
 
 Appendix A.  SVE programmer's model (informative)
 =================================================
@@ -593,3 +611,5 @@ References
     http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055c/IHI0055C_beta_aapcs64.pdf
     http://infocenter.arm.com/help/topic/com.arm.doc.subset.swdev.abi/index.html
     Procedure Call Standard for the ARM 64-bit Architecture (AArch64)
+
+[5] https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst
diff --git a/MAINTAINERS b/MAINTAINERS
index 589517372408..58d0aefead54 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -749,6 +749,12 @@ S:	Supported
 F:	drivers/infiniband/hw/erdma
 F:	include/uapi/rdma/erdma-abi.h
 
+ALIBABA PMU DRIVER
+M:	Shuai Xue <xueshuai@linux.alibaba.com>
+S:	Supported
+F:	Documentation/admin-guide/perf/alibaba_pmu.rst
+F:	drivers/perf/alibaba_uncore_dwr_pmu.c
+
 ALIENWARE WMI DRIVER
 L:	Dell.Client.Kernel@dell.com
 S:	Maintained
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9fb9fff08c94..526ab76cd233 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -733,6 +733,19 @@ config ARM64_ERRATUM_2077057
 
 	  If unsure, say Y.
 
+config ARM64_ERRATUM_2658417
+	bool "Cortex-A510: 2658417: remove BF16 support due to incorrect result"
+	default y
+	help
+	  This option adds the workaround for ARM Cortex-A510 erratum 2658417.
+	  Affected Cortex-A510 (r0p0 to r1p1) may produce the wrong result for
+	  BFMMLA or VMMLA instructions in rare circumstances when a pair of
+	  A510 CPUs are using shared neon hardware. As the sharing is not
+	  discoverable by the kernel, hide the BF16 HWCAP to indicate that
+	  user-space should not be using these instructions.
+
+	  If unsure, say Y.
+
 config ARM64_ERRATUM_2119858
 	bool "Cortex-A710/X2: 2119858: workaround TRBE overwriting trace data in FILL mode"
 	default y
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index d5b2d2dd4904..d699933cab45 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -18,6 +18,7 @@ CONFIG_NUMA_BALANCING=y
 CONFIG_MEMCG=y
 CONFIG_BLK_CGROUP=y
 CONFIG_CGROUP_PIDS=y
+CONFIG_CGROUP_FREEZER=y
 CONFIG_CGROUP_HUGETLB=y
 CONFIG_CPUSETS=y
 CONFIG_CGROUP_DEVICE=y
@@ -102,6 +103,8 @@ CONFIG_ARM_SCMI_CPUFREQ=y
 CONFIG_ARM_TEGRA186_CPUFREQ=y
 CONFIG_QORIQ_CPUFREQ=y
 CONFIG_ACPI=y
+CONFIG_ACPI_HOTPLUG_MEMORY=y
+CONFIG_ACPI_HMAT=y
 CONFIG_ACPI_APEI=y
 CONFIG_ACPI_APEI_GHES=y
 CONFIG_ACPI_APEI_PCIEAER=y
@@ -126,6 +129,8 @@ CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 # CONFIG_COMPAT_BRK is not set
+CONFIG_MEMORY_HOTPLUG=y
+CONFIG_MEMORY_HOTREMOVE=y
 CONFIG_KSM=y
 CONFIG_MEMORY_FAILURE=y
 CONFIG_TRANSPARENT_HUGEPAGE=y
@@ -139,12 +144,16 @@ CONFIG_IP_PNP_DHCP=y
 CONFIG_IP_PNP_BOOTP=y
 CONFIG_IPV6=m
 CONFIG_NETFILTER=y
+CONFIG_BRIDGE_NETFILTER=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NETFILTER_XT_MARK=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 CONFIG_NETFILTER_XT_TARGET_LOG=m
 CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m
 CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
+CONFIG_NETFILTER_XT_MATCH_IPVS=m
+CONFIG_IP_VS=m
 CONFIG_IP_NF_IPTABLES=m
 CONFIG_IP_NF_FILTER=m
 CONFIG_IP_NF_TARGET_REJECT=m
@@ -1229,8 +1238,14 @@ CONFIG_PHY_UNIPHIER_USB3=y
 CONFIG_PHY_TEGRA_XUSB=y
 CONFIG_PHY_AM654_SERDES=m
 CONFIG_PHY_J721E_WIZ=m
+CONFIG_ARM_CCI_PMU=m
+CONFIG_ARM_CCN=m
+CONFIG_ARM_CMN=m
 CONFIG_ARM_SMMU_V3_PMU=m
+CONFIG_ARM_DSU_PMU=m
 CONFIG_FSL_IMX8_DDR_PMU=m
+CONFIG_ARM_SPE_PMU=m
+CONFIG_ARM_DMC620_PMU=m
 CONFIG_QCOM_L2_PMU=y
 CONFIG_QCOM_L3_PMU=y
 CONFIG_HISI_PMU=y
@@ -1325,4 +1340,12 @@ CONFIG_DEBUG_FS=y
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
 # CONFIG_FTRACE is not set
+CONFIG_CORESIGHT=m
+CONFIG_CORESIGHT_LINK_AND_SINK_TMC=m
+CONFIG_CORESIGHT_CATU=m
+CONFIG_CORESIGHT_SINK_TPIU=m
+CONFIG_CORESIGHT_SINK_ETBV10=m
+CONFIG_CORESIGHT_STM=m
+CONFIG_CORESIGHT_CPU_DEBUG=m
+CONFIG_CORESIGHT_CTI=m
 CONFIG_MEMTEST=y
diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index fe0db8d416fb..0890e4f568fb 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -12,19 +12,6 @@
 
 #include <linux/stringify.h>
 
-#ifdef CONFIG_ARM64_LSE_ATOMICS
-#define __LL_SC_FALLBACK(asm_ops)					\
-"	b	3f\n"							\
-"	.subsection	1\n"						\
-"3:\n"									\
-asm_ops "\n"								\
-"	b	4f\n"							\
-"	.previous\n"							\
-"4:\n"
-#else
-#define __LL_SC_FALLBACK(asm_ops) asm_ops
-#endif
-
 #ifndef CONFIG_CC_HAS_K_CONSTRAINT
 #define K
 #endif
@@ -36,38 +23,36 @@ asm_ops "\n"								\
  */
 
 #define ATOMIC_OP(op, asm_op, constraint)				\
-static inline void							\
+static __always_inline void						\
 __ll_sc_atomic_##op(int i, atomic_t *v)					\
 {									\
 	unsigned long tmp;						\
 	int result;							\
 									\
 	asm volatile("// atomic_" #op "\n"				\
-	__LL_SC_FALLBACK(						\
 	"	prfm	pstl1strm, %2\n"				\
 	"1:	ldxr	%w0, %2\n"					\
 	"	" #asm_op "	%w0, %w0, %w3\n"			\
 	"	stxr	%w1, %w0, %2\n"					\
-	"	cbnz	%w1, 1b\n")					\
+	"	cbnz	%w1, 1b\n"					\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: __stringify(constraint) "r" (i));				\
 }
 
 #define ATOMIC_OP_RETURN(name, mb, acq, rel, cl, op, asm_op, constraint)\
-static inline int							\
+static __always_inline int						\
 __ll_sc_atomic_##op##_return##name(int i, atomic_t *v)			\
 {									\
 	unsigned long tmp;						\
 	int result;							\
 									\
 	asm volatile("// atomic_" #op "_return" #name "\n"		\
-	__LL_SC_FALLBACK(						\
 	"	prfm	pstl1strm, %2\n"				\
 	"1:	ld" #acq "xr	%w0, %2\n"				\
 	"	" #asm_op "	%w0, %w0, %w3\n"			\
 	"	st" #rel "xr	%w1, %w0, %2\n"				\
 	"	cbnz	%w1, 1b\n"					\
-	"	" #mb )							\
+	"	" #mb							\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: __stringify(constraint) "r" (i)				\
 	: cl);								\
@@ -76,20 +61,19 @@ __ll_sc_atomic_##op##_return##name(int i, atomic_t *v)			\
 }
 
 #define ATOMIC_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint) \
-static inline int							\
+static __always_inline int						\
 __ll_sc_atomic_fetch_##op##name(int i, atomic_t *v)			\
 {									\
 	unsigned long tmp;						\
 	int val, result;						\
 									\
 	asm volatile("// atomic_fetch_" #op #name "\n"			\
-	__LL_SC_FALLBACK(						\
 	"	prfm	pstl1strm, %3\n"				\
 	"1:	ld" #acq "xr	%w0, %3\n"				\
 	"	" #asm_op "	%w1, %w0, %w4\n"			\
 	"	st" #rel "xr	%w2, %w1, %3\n"				\
 	"	cbnz	%w2, 1b\n"					\
-	"	" #mb )							\
+	"	" #mb							\
 	: "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter)	\
 	: __stringify(constraint) "r" (i)				\
 	: cl);								\
@@ -135,38 +119,36 @@ ATOMIC_OPS(andnot, bic, )
 #undef ATOMIC_OP
 
 #define ATOMIC64_OP(op, asm_op, constraint)				\
-static inline void							\
+static __always_inline void						\
 __ll_sc_atomic64_##op(s64 i, atomic64_t *v)				\
 {									\
 	s64 result;							\
 	unsigned long tmp;						\
 									\
 	asm volatile("// atomic64_" #op "\n"				\
-	__LL_SC_FALLBACK(						\
 	"	prfm	pstl1strm, %2\n"				\
 	"1:	ldxr	%0, %2\n"					\
 	"	" #asm_op "	%0, %0, %3\n"				\
 	"	stxr	%w1, %0, %2\n"					\
-	"	cbnz	%w1, 1b")					\
+	"	cbnz	%w1, 1b"					\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: __stringify(constraint) "r" (i));				\
 }
 
 #define ATOMIC64_OP_RETURN(name, mb, acq, rel, cl, op, asm_op, constraint)\
-static inline long							\
+static __always_inline long						\
 __ll_sc_atomic64_##op##_return##name(s64 i, atomic64_t *v)		\
 {									\
 	s64 result;							\
 	unsigned long tmp;						\
 									\
 	asm volatile("// atomic64_" #op "_return" #name "\n"		\
-	__LL_SC_FALLBACK(						\
 	"	prfm	pstl1strm, %2\n"				\
 	"1:	ld" #acq "xr	%0, %2\n"				\
 	"	" #asm_op "	%0, %0, %3\n"				\
 	"	st" #rel "xr	%w1, %0, %2\n"				\
 	"	cbnz	%w1, 1b\n"					\
-	"	" #mb )							\
+	"	" #mb							\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: __stringify(constraint) "r" (i)				\
 	: cl);								\
@@ -175,20 +157,19 @@ __ll_sc_atomic64_##op##_return##name(s64 i, atomic64_t *v)		\
 }
 
 #define ATOMIC64_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint)\
-static inline long							\
+static __always_inline long						\
 __ll_sc_atomic64_fetch_##op##name(s64 i, atomic64_t *v)			\
 {									\
 	s64 result, val;						\
 	unsigned long tmp;						\
 									\
 	asm volatile("// atomic64_fetch_" #op #name "\n"		\
-	__LL_SC_FALLBACK(						\
 	"	prfm	pstl1strm, %3\n"				\
 	"1:	ld" #acq "xr	%0, %3\n"				\
 	"	" #asm_op "	%1, %0, %4\n"				\
 	"	st" #rel "xr	%w2, %1, %3\n"				\
 	"	cbnz	%w2, 1b\n"					\
-	"	" #mb )							\
+	"	" #mb							\
 	: "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter)	\
 	: __stringify(constraint) "r" (i)				\
 	: cl);								\
@@ -233,14 +214,13 @@ ATOMIC64_OPS(andnot, bic, )
 #undef ATOMIC64_OP_RETURN
 #undef ATOMIC64_OP
 
-static inline s64
+static __always_inline s64
 __ll_sc_atomic64_dec_if_positive(atomic64_t *v)
 {
 	s64 result;
 	unsigned long tmp;
 
 	asm volatile("// atomic64_dec_if_positive\n"
-	__LL_SC_FALLBACK(
 	"	prfm	pstl1strm, %2\n"
 	"1:	ldxr	%0, %2\n"
 	"	subs	%0, %0, #1\n"
@@ -248,7 +228,7 @@ __ll_sc_atomic64_dec_if_positive(atomic64_t *v)
 	"	stlxr	%w1, %0, %2\n"
 	"	cbnz	%w1, 1b\n"
 	"	dmb	ish\n"
-	"2:")
+	"2:"
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
 	:
 	: "cc", "memory");
@@ -257,7 +237,7 @@ __ll_sc_atomic64_dec_if_positive(atomic64_t *v)
 }
 
 #define __CMPXCHG_CASE(w, sfx, name, sz, mb, acq, rel, cl, constraint)	\
-static inline u##sz							\
+static __always_inline u##sz						\
 __ll_sc__cmpxchg_case_##name##sz(volatile void *ptr,			\
 					 unsigned long old,		\
 					 u##sz new)			\
@@ -274,7 +254,6 @@ __ll_sc__cmpxchg_case_##name##sz(volatile void *ptr,			\
 		old = (u##sz)old;					\
 									\
 	asm volatile(							\
-	__LL_SC_FALLBACK(						\
 	"	prfm	pstl1strm, %[v]\n"				\
 	"1:	ld" #acq "xr" #sfx "\t%" #w "[oldval], %[v]\n"		\
 	"	eor	%" #w "[tmp], %" #w "[oldval], %" #w "[old]\n"	\
@@ -282,7 +261,7 @@ __ll_sc__cmpxchg_case_##name##sz(volatile void *ptr,			\
 	"	st" #rel "xr" #sfx "\t%w[tmp], %" #w "[new], %[v]\n"	\
 	"	cbnz	%w[tmp], 1b\n"					\
 	"	" #mb "\n"						\
-	"2:")								\
+	"2:"								\
 	: [tmp] "=&r" (tmp), [oldval] "=&r" (oldval),			\
 	  [v] "+Q" (*(u##sz *)ptr)					\
 	: [old] __stringify(constraint) "r" (old), [new] "r" (new)	\
@@ -316,7 +295,7 @@ __CMPXCHG_CASE( ,  ,  mb_, 64, dmb ish,  , l, "memory", L)
 #undef __CMPXCHG_CASE
 
 #define __CMPXCHG_DBL(name, mb, rel, cl)				\
-static inline long							\
+static __always_inline long						\
 __ll_sc__cmpxchg_double##name(unsigned long old1,			\
 				      unsigned long old2,		\
 				      unsigned long new1,		\
@@ -326,7 +305,6 @@ __ll_sc__cmpxchg_double##name(unsigned long old1,			\
 	unsigned long tmp, ret;						\
 									\
 	asm volatile("// __cmpxchg_double" #name "\n"			\
-	__LL_SC_FALLBACK(						\
 	"	prfm	pstl1strm, %2\n"				\
 	"1:	ldxp	%0, %1, %2\n"					\
 	"	eor	%0, %0, %3\n"					\
@@ -336,7 +314,7 @@ __ll_sc__cmpxchg_double##name(unsigned long old1,			\
 	"	st" #rel "xp	%w0, %5, %6, %2\n"			\
 	"	cbnz	%w0, 1b\n"					\
 	"	" #mb "\n"						\
-	"2:")								\
+	"2:"								\
 	: "=&r" (tmp), "=&r" (ret), "+Q" (*(unsigned long *)ptr)	\
 	: "r" (old1), "r" (old2), "r" (new1), "r" (new2)		\
 	: cl);								\
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index 5d460f6b7675..52075e93de6c 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -11,7 +11,8 @@
 #define __ASM_ATOMIC_LSE_H
 
 #define ATOMIC_OP(op, asm_op)						\
-static inline void __lse_atomic_##op(int i, atomic_t *v)		\
+static __always_inline void						\
+__lse_atomic_##op(int i, atomic_t *v)					\
 {									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
@@ -25,7 +26,7 @@ ATOMIC_OP(or, stset)
 ATOMIC_OP(xor, steor)
 ATOMIC_OP(add, stadd)
 
-static inline void __lse_atomic_sub(int i, atomic_t *v)
+static __always_inline void __lse_atomic_sub(int i, atomic_t *v)
 {
 	__lse_atomic_add(-i, v);
 }
@@ -33,7 +34,8 @@ static inline void __lse_atomic_sub(int i, atomic_t *v)
 #undef ATOMIC_OP
 
 #define ATOMIC_FETCH_OP(name, mb, op, asm_op, cl...)			\
-static inline int __lse_atomic_fetch_##op##name(int i, atomic_t *v)	\
+static __always_inline int						\
+__lse_atomic_fetch_##op##name(int i, atomic_t *v)			\
 {									\
 	int old;							\
 									\
@@ -63,7 +65,8 @@ ATOMIC_FETCH_OPS(add, ldadd)
 #undef ATOMIC_FETCH_OPS
 
 #define ATOMIC_FETCH_OP_SUB(name)					\
-static inline int __lse_atomic_fetch_sub##name(int i, atomic_t *v)	\
+static __always_inline int						\
+__lse_atomic_fetch_sub##name(int i, atomic_t *v)			\
 {									\
 	return __lse_atomic_fetch_add##name(-i, v);			\
 }
@@ -76,12 +79,14 @@ ATOMIC_FETCH_OP_SUB(        )
 #undef ATOMIC_FETCH_OP_SUB
 
 #define ATOMIC_OP_ADD_SUB_RETURN(name)					\
-static inline int __lse_atomic_add_return##name(int i, atomic_t *v)	\
+static __always_inline int						\
+__lse_atomic_add_return##name(int i, atomic_t *v)			\
 {									\
 	return __lse_atomic_fetch_add##name(i, v) + i;			\
 }									\
 									\
-static inline int __lse_atomic_sub_return##name(int i, atomic_t *v)	\
+static __always_inline int						\
+__lse_atomic_sub_return##name(int i, atomic_t *v)			\
 {									\
 	return __lse_atomic_fetch_sub(i, v) - i;			\
 }
@@ -93,13 +98,14 @@ ATOMIC_OP_ADD_SUB_RETURN(        )
 
 #undef ATOMIC_OP_ADD_SUB_RETURN
 
-static inline void __lse_atomic_and(int i, atomic_t *v)
+static __always_inline void __lse_atomic_and(int i, atomic_t *v)
 {
 	return __lse_atomic_andnot(~i, v);
 }
 
 #define ATOMIC_FETCH_OP_AND(name, mb, cl...)				\
-static inline int __lse_atomic_fetch_and##name(int i, atomic_t *v)	\
+static __always_inline int						\
+__lse_atomic_fetch_and##name(int i, atomic_t *v)			\
 {									\
 	return __lse_atomic_fetch_andnot##name(~i, v);			\
 }
@@ -112,7 +118,8 @@ ATOMIC_FETCH_OP_AND(        , al, "memory")
 #undef ATOMIC_FETCH_OP_AND
 
 #define ATOMIC64_OP(op, asm_op)						\
-static inline void __lse_atomic64_##op(s64 i, atomic64_t *v)		\
+static __always_inline void						\
+__lse_atomic64_##op(s64 i, atomic64_t *v)				\
 {									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
@@ -126,7 +133,7 @@ ATOMIC64_OP(or, stset)
 ATOMIC64_OP(xor, steor)
 ATOMIC64_OP(add, stadd)
 
-static inline void __lse_atomic64_sub(s64 i, atomic64_t *v)
+static __always_inline void __lse_atomic64_sub(s64 i, atomic64_t *v)
 {
 	__lse_atomic64_add(-i, v);
 }
@@ -134,7 +141,8 @@ static inline void __lse_atomic64_sub(s64 i, atomic64_t *v)
 #undef ATOMIC64_OP
 
 #define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...)			\
-static inline long __lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v)\
+static __always_inline long						\
+__lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v)			\
 {									\
 	s64 old;							\
 									\
@@ -164,7 +172,8 @@ ATOMIC64_FETCH_OPS(add, ldadd)
 #undef ATOMIC64_FETCH_OPS
 
 #define ATOMIC64_FETCH_OP_SUB(name)					\
-static inline long __lse_atomic64_fetch_sub##name(s64 i, atomic64_t *v)	\
+static __always_inline long						\
+__lse_atomic64_fetch_sub##name(s64 i, atomic64_t *v)			\
 {									\
 	return __lse_atomic64_fetch_add##name(-i, v);			\
 }
@@ -177,12 +186,14 @@ ATOMIC64_FETCH_OP_SUB(        )
 #undef ATOMIC64_FETCH_OP_SUB
 
 #define ATOMIC64_OP_ADD_SUB_RETURN(name)				\
-static inline long __lse_atomic64_add_return##name(s64 i, atomic64_t *v)\
+static __always_inline long						\
+__lse_atomic64_add_return##name(s64 i, atomic64_t *v)			\
 {									\
 	return __lse_atomic64_fetch_add##name(i, v) + i;		\
 }									\
 									\
-static inline long __lse_atomic64_sub_return##name(s64 i, atomic64_t *v)\
+static __always_inline long						\
+__lse_atomic64_sub_return##name(s64 i, atomic64_t *v)			\
 {									\
 	return __lse_atomic64_fetch_sub##name(i, v) - i;		\
 }
@@ -194,13 +205,14 @@ ATOMIC64_OP_ADD_SUB_RETURN(        )
 
 #undef ATOMIC64_OP_ADD_SUB_RETURN
 
-static inline void __lse_atomic64_and(s64 i, atomic64_t *v)
+static __always_inline void __lse_atomic64_and(s64 i, atomic64_t *v)
 {
 	return __lse_atomic64_andnot(~i, v);
 }
 
 #define ATOMIC64_FETCH_OP_AND(name, mb, cl...)				\
-static inline long __lse_atomic64_fetch_and##name(s64 i, atomic64_t *v)	\
+static __always_inline long						\
+__lse_atomic64_fetch_and##name(s64 i, atomic64_t *v)			\
 {									\
 	return __lse_atomic64_fetch_andnot##name(~i, v);		\
 }
@@ -212,7 +224,7 @@ ATOMIC64_FETCH_OP_AND(        , al, "memory")
 
 #undef ATOMIC64_FETCH_OP_AND
 
-static inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v)
+static __always_inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v)
 {
 	unsigned long tmp;
 
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index ff06e6fb5939..a08672286b4c 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -907,6 +907,8 @@ static inline unsigned int get_vmid_bits(u64 mmfr1)
 	return 8;
 }
 
+struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id);
+
 extern struct arm64_ftr_override id_aa64mmfr1_override;
 extern struct arm64_ftr_override id_aa64pfr0_override;
 extern struct arm64_ftr_override id_aa64pfr1_override;
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index d94aecff9690..0278a58abe69 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -58,8 +58,9 @@ asmlinkage void call_on_irq_stack(struct pt_regs *regs,
 asmlinkage void asm_exit_to_user_mode(struct pt_regs *regs);
 
 void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs);
-void do_undefinstr(struct pt_regs *regs);
-void do_bti(struct pt_regs *regs);
+void do_undefinstr(struct pt_regs *regs, unsigned long esr);
+void do_el0_bti(struct pt_regs *regs);
+void do_el1_bti(struct pt_regs *regs, unsigned long esr);
 void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
 			struct pt_regs *regs);
 void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs);
@@ -72,7 +73,8 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr);
 void do_cp15instr(unsigned long esr, struct pt_regs *regs);
 void do_el0_svc(struct pt_regs *regs);
 void do_el0_svc_compat(struct pt_regs *regs);
-void do_ptrauth_fault(struct pt_regs *regs, unsigned long esr);
+void do_el0_fpac(struct pt_regs *regs, unsigned long esr);
+void do_el1_fpac(struct pt_regs *regs, unsigned long esr);
 void do_serror(struct pt_regs *regs, unsigned long esr);
 void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags);
 
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index cef4ae7a3d8b..298b386d3ebe 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -119,6 +119,7 @@
 #define KERNEL_HWCAP_SME_FA64		__khwcap2_feature(SME_FA64)
 #define KERNEL_HWCAP_WFXT		__khwcap2_feature(WFXT)
 #define KERNEL_HWCAP_EBF16		__khwcap2_feature(EBF16)
+#define KERNEL_HWCAP_SVE_EBF16		__khwcap2_feature(SVE_EBF16)
 
 /*
  * This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h
index 4e7fa2623896..18734fed3bdd 100644
--- a/arch/arm64/include/asm/module.h
+++ b/arch/arm64/include/asm/module.h
@@ -58,11 +58,20 @@ static inline bool is_forbidden_offset_for_adrp(void *place)
 }
 
 struct plt_entry get_plt_entry(u64 dst, void *pc);
-bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b);
 
-static inline bool plt_entry_is_initialized(const struct plt_entry *e)
+static inline const Elf_Shdr *find_section(const Elf_Ehdr *hdr,
+				    const Elf_Shdr *sechdrs,
+				    const char *name)
 {
-	return e->adrp || e->add || e->br;
+	const Elf_Shdr *s, *se;
+	const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+	for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) {
+		if (strcmp(name, secstrs + s->sh_name) == 0)
+			return s;
+	}
+
+	return NULL;
 }
 
 #endif /* __ASM_MODULE_H */
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 86eb0bfe3b38..61883518fc50 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -410,7 +410,7 @@ long get_tagged_addr_ctrl(struct task_struct *task);
  * The top of the current task's task stack
  */
 #define current_top_of_stack()	((unsigned long)current->stack + THREAD_SIZE)
-#define on_thread_stack()	(on_task_stack(current, current_stack_pointer, 1, NULL))
+#define on_thread_stack()	(on_task_stack(current, current_stack_pointer, 1))
 
 #endif /* __ASSEMBLY__ */
 #endif /* __ASM_PROCESSOR_H */
diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h
index 7bea1d705dd6..4292d9bafb9d 100644
--- a/arch/arm64/include/asm/sdei.h
+++ b/arch/arm64/include/asm/sdei.h
@@ -43,22 +43,5 @@ unsigned long do_sdei_event(struct pt_regs *regs,
 unsigned long sdei_arch_get_entry_point(int conduit);
 #define sdei_arch_get_entry_point(x)	sdei_arch_get_entry_point(x)
 
-struct stack_info;
-
-bool _on_sdei_stack(unsigned long sp, unsigned long size,
-		    struct stack_info *info);
-static inline bool on_sdei_stack(unsigned long sp, unsigned long size,
-				struct stack_info *info)
-{
-	if (!IS_ENABLED(CONFIG_VMAP_STACK))
-		return false;
-	if (!IS_ENABLED(CONFIG_ARM_SDE_INTERFACE))
-		return false;
-	if (in_nmi())
-		return _on_sdei_stack(sp, size, info);
-
-	return false;
-}
-
 #endif /* __ASSEMBLY__ */
 #endif	/* __ASM_SDEI_H */
diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index 6ebdcdff77f5..5a0edb064ea4 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -22,39 +22,86 @@ extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
 
 DECLARE_PER_CPU(unsigned long *, irq_stack_ptr);
 
-static inline bool on_irq_stack(unsigned long sp, unsigned long size,
-				struct stack_info *info)
+static inline struct stack_info stackinfo_get_irq(void)
 {
 	unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr);
 	unsigned long high = low + IRQ_STACK_SIZE;
 
-	return on_stack(sp, size, low, high, STACK_TYPE_IRQ, info);
+	return (struct stack_info) {
+		.low = low,
+		.high = high,
+	};
 }
 
-static inline bool on_task_stack(const struct task_struct *tsk,
-				 unsigned long sp, unsigned long size,
-				 struct stack_info *info)
+static inline bool on_irq_stack(unsigned long sp, unsigned long size)
+{
+	struct stack_info info = stackinfo_get_irq();
+	return stackinfo_on_stack(&info, sp, size);
+}
+
+static inline struct stack_info stackinfo_get_task(const struct task_struct *tsk)
 {
 	unsigned long low = (unsigned long)task_stack_page(tsk);
 	unsigned long high = low + THREAD_SIZE;
 
-	return on_stack(sp, size, low, high, STACK_TYPE_TASK, info);
+	return (struct stack_info) {
+		.low = low,
+		.high = high,
+	};
+}
+
+static inline bool on_task_stack(const struct task_struct *tsk,
+				 unsigned long sp, unsigned long size)
+{
+	struct stack_info info = stackinfo_get_task(tsk);
+	return stackinfo_on_stack(&info, sp, size);
 }
 
 #ifdef CONFIG_VMAP_STACK
 DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack);
 
-static inline bool on_overflow_stack(unsigned long sp, unsigned long size,
-				struct stack_info *info)
+static inline struct stack_info stackinfo_get_overflow(void)
 {
 	unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack);
 	unsigned long high = low + OVERFLOW_STACK_SIZE;
 
-	return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info);
+	return (struct stack_info) {
+		.low = low,
+		.high = high,
+	};
+}
+#else
+#define stackinfo_get_overflow()	stackinfo_get_unknown()
+#endif
+
+#if defined(CONFIG_ARM_SDE_INTERFACE) && defined(CONFIG_VMAP_STACK)
+DECLARE_PER_CPU(unsigned long *, sdei_stack_normal_ptr);
+DECLARE_PER_CPU(unsigned long *, sdei_stack_critical_ptr);
+
+static inline struct stack_info stackinfo_get_sdei_normal(void)
+{
+	unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr);
+	unsigned long high = low + SDEI_STACK_SIZE;
+
+	return (struct stack_info) {
+		.low = low,
+		.high = high,
+	};
+}
+
+static inline struct stack_info stackinfo_get_sdei_critical(void)
+{
+	unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr);
+	unsigned long high = low + SDEI_STACK_SIZE;
+
+	return (struct stack_info) {
+		.low = low,
+		.high = high,
+	};
 }
 #else
-static inline bool on_overflow_stack(unsigned long sp, unsigned long size,
-			struct stack_info *info) { return false; }
+#define stackinfo_get_sdei_normal()	stackinfo_get_unknown()
+#define stackinfo_get_sdei_critical()	stackinfo_get_unknown()
 #endif
 
 #endif	/* __ASM_STACKTRACE_H */
diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h
index f58eb944c46f..508f734de46e 100644
--- a/arch/arm64/include/asm/stacktrace/common.h
+++ b/arch/arm64/include/asm/stacktrace/common.h
@@ -2,13 +2,6 @@
 /*
  * Common arm64 stack unwinder code.
  *
- * To implement a new arm64 stack unwinder:
- *     1) Include this header
- *
- *     2) Call into unwind_next_common() from your top level unwind
- *        function, passing it the validation and translation callbacks
- *        (though the later can be NULL if no translation is required).
- *
  * See: arch/arm64/kernel/stacktrace.c for the reference implementation.
  *
  * Copyright (C) 2012 ARM Ltd.
@@ -16,78 +9,60 @@
 #ifndef __ASM_STACKTRACE_COMMON_H
 #define __ASM_STACKTRACE_COMMON_H
 
-#include <linux/bitmap.h>
-#include <linux/bitops.h>
 #include <linux/kprobes.h>
 #include <linux/types.h>
 
-enum stack_type {
-	STACK_TYPE_UNKNOWN,
-	STACK_TYPE_TASK,
-	STACK_TYPE_IRQ,
-	STACK_TYPE_OVERFLOW,
-	STACK_TYPE_SDEI_NORMAL,
-	STACK_TYPE_SDEI_CRITICAL,
-	STACK_TYPE_HYP,
-	__NR_STACK_TYPES
-};
-
 struct stack_info {
 	unsigned long low;
 	unsigned long high;
-	enum stack_type type;
 };
 
-/*
- * A snapshot of a frame record or fp/lr register values, along with some
- * accounting information necessary for robust unwinding.
+/**
+ * struct unwind_state - state used for robust unwinding.
  *
  * @fp:          The fp value in the frame record (or the real fp)
  * @pc:          The lr value in the frame record (or the real lr)
  *
- * @stacks_done: Stacks which have been entirely unwound, for which it is no
- *               longer valid to unwind to.
- *
- * @prev_fp:     The fp that pointed to this frame record, or a synthetic value
- *               of 0. This is used to ensure that within a stack, each
- *               subsequent frame record is at an increasing address.
- * @prev_type:   The type of stack this frame record was on, or a synthetic
- *               value of STACK_TYPE_UNKNOWN. This is used to detect a
- *               transition from one stack to another.
- *
  * @kr_cur:      When KRETPROBES is selected, holds the kretprobe instance
  *               associated with the most recently encountered replacement lr
  *               value.
  *
  * @task:        The task being unwound.
+ *
+ * @stack:       The stack currently being unwound.
+ * @stacks:      An array of stacks which can be unwound.
+ * @nr_stacks:   The number of stacks in @stacks.
  */
 struct unwind_state {
 	unsigned long fp;
 	unsigned long pc;
-	DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES);
-	unsigned long prev_fp;
-	enum stack_type prev_type;
 #ifdef CONFIG_KRETPROBES
 	struct llist_node *kr_cur;
 #endif
 	struct task_struct *task;
+
+	struct stack_info stack;
+	struct stack_info *stacks;
+	int nr_stacks;
 };
 
-static inline bool on_stack(unsigned long sp, unsigned long size,
-			    unsigned long low, unsigned long high,
-			    enum stack_type type, struct stack_info *info)
+static inline struct stack_info stackinfo_get_unknown(void)
+{
+	return (struct stack_info) {
+		.low = 0,
+		.high = 0,
+	};
+}
+
+static inline bool stackinfo_on_stack(const struct stack_info *info,
+				      unsigned long sp, unsigned long size)
 {
-	if (!low)
+	if (!info->low)
 		return false;
 
-	if (sp < low || sp + size < sp || sp + size > high)
+	if (sp < info->low || sp + size < sp || sp + size > info->high)
 		return false;
 
-	if (info) {
-		info->low = low;
-		info->high = high;
-		info->type = type;
-	}
 	return true;
 }
 
@@ -99,99 +74,101 @@ static inline void unwind_init_common(struct unwind_state *state,
 	state->kr_cur = NULL;
 #endif
 
-	/*
-	 * Prime the first unwind.
-	 *
-	 * In unwind_next() we'll check that the FP points to a valid stack,
-	 * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be
-	 * treated as a transition to whichever stack that happens to be. The
-	 * prev_fp value won't be used, but we set it to 0 such that it is
-	 * definitely not an accessible stack address.
-	 */
-	bitmap_zero(state->stacks_done, __NR_STACK_TYPES);
-	state->prev_fp = 0;
-	state->prev_type = STACK_TYPE_UNKNOWN;
+	state->stack = stackinfo_get_unknown();
 }
 
-/*
- * stack_trace_translate_fp_fn() - Translates a non-kernel frame pointer to
- * a kernel address.
- *
- * @fp:   the frame pointer to be updated to its kernel address.
- * @type: the stack type associated with frame pointer @fp
- *
- * Returns true and success and @fp is updated to the corresponding
- * kernel virtual address; otherwise returns false.
- */
-typedef bool (*stack_trace_translate_fp_fn)(unsigned long *fp,
-					    enum stack_type type);
+static struct stack_info *unwind_find_next_stack(const struct unwind_state *state,
+						 unsigned long sp,
+						 unsigned long size)
+{
+	for (int i = 0; i < state->nr_stacks; i++) {
+		struct stack_info *info = &state->stacks[i];
 
-/*
- * on_accessible_stack_fn() - Check whether a stack range is on any
- * of the possible stacks.
+		if (stackinfo_on_stack(info, sp, size))
+			return info;
+	}
+
+	return NULL;
+}
+
+/**
+ * unwind_consume_stack() - Check if an object is on an accessible stack,
+ * updating stack boundaries so that future unwind steps cannot consume this
+ * object again.
  *
- * @tsk:  task whose stack is being unwound
- * @sp:   stack address being checked
- * @size: size of the stack range being checked
- * @info: stack unwinding context
+ * @state: the current unwind state.
+ * @sp:    the base address of the object.
+ * @size:  the size of the object.
+ *
+ * Return: 0 upon success, an error code otherwise.
  */
-typedef bool (*on_accessible_stack_fn)(const struct task_struct *tsk,
-				       unsigned long sp, unsigned long size,
-				       struct stack_info *info);
-
-static inline int unwind_next_common(struct unwind_state *state,
-				     struct stack_info *info,
-				     on_accessible_stack_fn accessible,
-				     stack_trace_translate_fp_fn translate_fp)
+static inline int unwind_consume_stack(struct unwind_state *state,
+				       unsigned long sp,
+				       unsigned long size)
 {
-	unsigned long fp = state->fp, kern_fp = fp;
-	struct task_struct *tsk = state->task;
+	struct stack_info *next;
 
-	if (fp & 0x7)
-		return -EINVAL;
+	if (stackinfo_on_stack(&state->stack, sp, size))
+		goto found;
 
-	if (!accessible(tsk, fp, 16, info))
-		return -EINVAL;
-
-	if (test_bit(info->type, state->stacks_done))
+	next = unwind_find_next_stack(state, sp, size);
+	if (!next)
 		return -EINVAL;
 
 	/*
-	 * If fp is not from the current address space perform the necessary
-	 * translation before dereferencing it to get the next fp.
-	 */
-	if (translate_fp && !translate_fp(&kern_fp, info->type))
-		return -EINVAL;
-
-	/*
-	 * As stacks grow downward, any valid record on the same stack must be
-	 * at a strictly higher address than the prior record.
+	 * Stack transitions are strictly one-way, and once we've
+	 * transitioned from one stack to another, it's never valid to
+	 * unwind back to the old stack.
+	 *
+	 * Remove the current stack from the list of stacks so that it cannot
+	 * be found on a subsequent transition.
 	 *
-	 * Stacks can nest in several valid orders, e.g.
+	 * Note that stacks can nest in several valid orders, e.g.
 	 *
-	 * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL
-	 * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW
-	 * HYP -> OVERFLOW
+	 *   TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL
+	 *   TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW
+	 *   HYP -> OVERFLOW
 	 *
-	 * ... but the nesting itself is strict. Once we transition from one
-	 * stack to another, it's never valid to unwind back to that first
-	 * stack.
+	 * ... so we do not check the specific order of stack
+	 * transitions.
 	 */
-	if (info->type == state->prev_type) {
-		if (fp <= state->prev_fp)
-			return -EINVAL;
-	} else {
-		__set_bit(state->prev_type, state->stacks_done);
-	}
+	state->stack = *next;
+	*next = stackinfo_get_unknown();
+
+found:
+	/*
+	 * Future unwind steps can only consume stack above this frame record.
+	 * Update the current stack to start immediately above it.
+	 */
+	state->stack.low = sp + size;
+	return 0;
+}
+
+/**
+ * unwind_next_frame_record() - Unwind to the next frame record.
+ *
+ * @state:        the current unwind state.
+ *
+ * Return: 0 upon success, an error code otherwise.
+ */
+static inline int
+unwind_next_frame_record(struct unwind_state *state)
+{
+	unsigned long fp = state->fp;
+	int err;
+
+	if (fp & 0x7)
+		return -EINVAL;
+
+	err = unwind_consume_stack(state, fp, 16);
+	if (err)
+		return err;
 
 	/*
-	 * Record this frame record's values and location. The prev_fp and
-	 * prev_type are only meaningful to the next unwind_next() invocation.
+	 * Record this frame record's values.
 	 */
-	state->fp = READ_ONCE(*(unsigned long *)(kern_fp));
-	state->pc = READ_ONCE(*(unsigned long *)(kern_fp + 8));
-	state->prev_fp = fp;
-	state->prev_type = info->type;
+	state->fp = READ_ONCE(*(unsigned long *)(fp));
+	state->pc = READ_ONCE(*(unsigned long *)(fp + 8));
 
 	return 0;
 }
diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h
index d5527b600390..25ab83a315a7 100644
--- a/arch/arm64/include/asm/stacktrace/nvhe.h
+++ b/arch/arm64/include/asm/stacktrace/nvhe.h
@@ -20,8 +20,8 @@
 
 #include <asm/stacktrace/common.h>
 
-/*
- * kvm_nvhe_unwind_init - Start an unwind from the given nVHE HYP fp and pc
+/**
+ * kvm_nvhe_unwind_init() - Start an unwind from the given nVHE HYP fp and pc
  *
  * @state : unwind_state to initialize
  * @fp    : frame pointer at which to start the unwinding.
diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h
index 0eb7709422e2..c34344256762 100644
--- a/arch/arm64/include/asm/system_misc.h
+++ b/arch/arm64/include/asm/system_misc.h
@@ -18,7 +18,7 @@
 
 struct pt_regs;
 
-void die(const char *msg, struct pt_regs *regs, int err);
+void die(const char *msg, struct pt_regs *regs, long err);
 
 struct siginfo;
 void arm64_notify_die(const char *str, struct pt_regs *regs,
diff --git a/arch/arm64/include/asm/vdso.h b/arch/arm64/include/asm/vdso.h
index f99dcb94b438..b4ae32109932 100644
--- a/arch/arm64/include/asm/vdso.h
+++ b/arch/arm64/include/asm/vdso.h
@@ -26,6 +26,9 @@
 	(void *)(vdso_offset_##name - VDSO_LBASE + (unsigned long)(base)); \
 })
 
+extern char vdso_start[], vdso_end[];
+extern char vdso32_start[], vdso32_end[];
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_VDSO_H */
diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h
index 4f7a629df81f..764d13e2916c 100644
--- a/arch/arm64/include/asm/vdso/gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/gettimeofday.h
@@ -7,8 +7,10 @@
 
 #ifndef __ASSEMBLY__
 
+#include <asm/alternative.h>
 #include <asm/barrier.h>
 #include <asm/unistd.h>
+#include <asm/sysreg.h>
 
 #define VDSO_HAS_CLOCK_GETRES		1
 
@@ -78,11 +80,20 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
 		return 0;
 
 	/*
-	 * This isb() is required to prevent that the counter value
+	 * If FEAT_ECV is available, use the self-synchronizing counter.
+	 * Otherwise the isb is required to prevent that the counter value
 	 * is speculated.
-	 */
-	isb();
-	asm volatile("mrs %0, cntvct_el0" : "=r" (res) :: "memory");
+	*/
+	asm volatile(
+	ALTERNATIVE("isb\n"
+		    "mrs %0, cntvct_el0",
+		    "nop\n"
+		    __mrs_s("%0", SYS_CNTVCTSS_EL0),
+		    ARM64_HAS_ECV)
+	: "=r" (res)
+	:
+	: "memory");
+
 	arch_counter_enforce_ordering(res);
 
 	return res;
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 1ad2568a2569..9b245da6f507 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -92,5 +92,6 @@
 #define HWCAP2_SME_FA64		(1 << 30)
 #define HWCAP2_WFXT		(1UL << 31)
 #define HWCAP2_EBF16		(1UL << 32)
+#define HWCAP2_SVE_EBF16	(1UL << 33)
 
 #endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/include/uapi/asm/perf_regs.h b/arch/arm64/include/uapi/asm/perf_regs.h
index d54daafa89e3..86e556429e0e 100644
--- a/arch/arm64/include/uapi/asm/perf_regs.h
+++ b/arch/arm64/include/uapi/asm/perf_regs.h
@@ -37,5 +37,12 @@ enum perf_event_arm_regs {
 	PERF_REG_ARM64_SP,
 	PERF_REG_ARM64_PC,
 	PERF_REG_ARM64_MAX,
+
+	/* Extended/pseudo registers */
+	PERF_REG_ARM64_VG = 46,				/* SVE Vector Granule */
+	PERF_REG_ARM64_EXTENDED_MAX
 };
+
+#define PERF_REG_EXTENDED_MASK	(1ULL << PERF_REG_ARM64_VG)
+
 #endif /* _ASM_ARM64_PERF_REGS_H */
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index 9bcaa5eacf16..a97775963f35 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -10,11 +10,14 @@
 
 #include <linux/init.h>
 #include <linux/cpu.h>
+#include <linux/elf.h>
 #include <asm/cacheflush.h>
 #include <asm/alternative.h>
 #include <asm/cpufeature.h>
 #include <asm/insn.h>
+#include <asm/module.h>
 #include <asm/sections.h>
+#include <asm/vdso.h>
 #include <linux/stop_machine.h>
 
 #define __ALT_PTR(a, f)		((void *)&(a)->f + (a)->f)
@@ -192,6 +195,30 @@ static void __nocfi __apply_alternatives(struct alt_region *region, bool is_modu
 	}
 }
 
+void apply_alternatives_vdso(void)
+{
+	struct alt_region region;
+	const struct elf64_hdr *hdr;
+	const struct elf64_shdr *shdr;
+	const struct elf64_shdr *alt;
+	DECLARE_BITMAP(all_capabilities, ARM64_NPATCHABLE);
+
+	bitmap_fill(all_capabilities, ARM64_NPATCHABLE);
+
+	hdr = (struct elf64_hdr *)vdso_start;
+	shdr = (void *)hdr + hdr->e_shoff;
+	alt = find_section(hdr, shdr, ".altinstructions");
+	if (!alt)
+		return;
+
+	region = (struct alt_region){
+		.begin	= (void *)hdr + alt->sh_offset,
+		.end	= (void *)hdr + alt->sh_offset + alt->sh_size,
+	};
+
+	__apply_alternatives(&region, false, &all_capabilities[0]);
+}
+
 /*
  * We might be patching the stop_machine state machine, so implement a
  * really simple polling protocol here.
@@ -225,6 +252,7 @@ static int __apply_alternatives_multi_stop(void *unused)
 
 void __init apply_alternatives_all(void)
 {
+	apply_alternatives_vdso();
 	/* better not try code patching on a live SMP system */
 	stop_machine(__apply_alternatives_multi_stop, NULL, cpu_online_mask);
 }
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 53b973b6059f..58ca4f6b25d6 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -121,6 +121,22 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused)
 	sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCI, 0);
 }
 
+static DEFINE_RAW_SPINLOCK(reg_user_mask_modification);
+static void __maybe_unused
+cpu_clear_bf16_from_user_emulation(const struct arm64_cpu_capabilities *__unused)
+{
+	struct arm64_ftr_reg *regp;
+
+	regp = get_arm64_ftr_reg(SYS_ID_AA64ISAR1_EL1);
+	if (!regp)
+		return;
+
+	raw_spin_lock(&reg_user_mask_modification);
+	if (regp->user_mask & ID_AA64ISAR1_EL1_BF16_MASK)
+		regp->user_mask &= ~ID_AA64ISAR1_EL1_BF16_MASK;
+	raw_spin_unlock(&reg_user_mask_modification);
+}
+
 #define CAP_MIDR_RANGE(model, v_min, r_min, v_max, r_max)	\
 	.matches = is_affected_midr_range,			\
 	.midr_range = MIDR_RANGE(model, v_min, r_min, v_max, r_max)
@@ -692,6 +708,16 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
 	},
 #endif
+#ifdef CONFIG_ARM64_ERRATUM_2658417
+	{
+		.desc = "ARM erratum 2658417",
+		.capability = ARM64_WORKAROUND_2658417,
+		/* Cortex-A510 r0p0 - r1p1 */
+		ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1),
+		MIDR_FIXED(MIDR_CPU_VAR_REV(1,1), BIT(25)),
+		.cpu_enable = cpu_clear_bf16_from_user_emulation,
+	},
+#endif
 	{
 	}
 };
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index c22732a6908b..bff9280e184f 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -750,7 +750,7 @@ static struct arm64_ftr_reg *get_arm64_ftr_reg_nowarn(u32 sys_id)
  * returns - Upon success,  matching ftr_reg entry for id.
  *         - NULL on failure but with an WARN_ON().
  */
-static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id)
+struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id)
 {
 	struct arm64_ftr_reg *reg;
 
@@ -1401,17 +1401,40 @@ feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry)
 	return val >= entry->min_field_value;
 }
 
-static bool
-has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope)
+static u64
+read_scoped_sysreg(const struct arm64_cpu_capabilities *entry, int scope)
 {
-	u64 val;
-
 	WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible());
 	if (scope == SCOPE_SYSTEM)
-		val = read_sanitised_ftr_reg(entry->sys_reg);
+		return read_sanitised_ftr_reg(entry->sys_reg);
 	else
-		val = __read_sysreg_by_encoding(entry->sys_reg);
+		return __read_sysreg_by_encoding(entry->sys_reg);
+}
+
+static bool
+has_user_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope)
+{
+	int mask;
+	struct arm64_ftr_reg *regp;
+	u64 val = read_scoped_sysreg(entry, scope);
+
+	regp = get_arm64_ftr_reg(entry->sys_reg);
+	if (!regp)
+		return false;
+
+	mask = cpuid_feature_extract_unsigned_field_width(regp->user_mask,
+							  entry->field_pos,
+							  entry->field_width);
+	if (!mask)
+		return false;
+
+	return feature_matches(val, entry);
+}
 
+static bool
+has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope)
+{
+	u64 val = read_scoped_sysreg(entry, scope);
 	return feature_matches(val, entry);
 }
 
@@ -2624,7 +2647,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 };
 
 #define HWCAP_CPUID_MATCH(reg, field, width, s, min_value)			\
-		.matches = has_cpuid_feature,					\
+		.matches = has_user_cpuid_feature,					\
 		.sys_reg = reg,							\
 		.field_pos = field,						\
 		.field_width = width,						\
@@ -2733,6 +2756,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
 	HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_AES_PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL),
 	HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_BitPerm_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM),
 	HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_BF16_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16),
+	HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_BF16_EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16),
 	HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_SHA3_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_SHA3_IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3),
 	HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_SM4_IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4),
 	HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_I8MM_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM),
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index d7702f39b4d3..28d4f442b0bc 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -115,6 +115,7 @@ static const char *const hwcap_str[] = {
 	[KERNEL_HWCAP_SME_FA64]		= "smefa64",
 	[KERNEL_HWCAP_WFXT]		= "wfxt",
 	[KERNEL_HWCAP_EBF16]		= "ebf16",
+	[KERNEL_HWCAP_SVE_EBF16]	= "sveebf16",
 };
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index c75ca36b4a49..9173fad279af 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -379,11 +379,20 @@ static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr)
 	exit_to_kernel_mode(regs);
 }
 
-static void noinstr el1_undef(struct pt_regs *regs)
+static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr)
 {
 	enter_from_kernel_mode(regs);
 	local_daif_inherit(regs);
-	do_undefinstr(regs);
+	do_undefinstr(regs, esr);
+	local_daif_mask();
+	exit_to_kernel_mode(regs);
+}
+
+static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr)
+{
+	enter_from_kernel_mode(regs);
+	local_daif_inherit(regs);
+	do_el1_bti(regs, esr);
 	local_daif_mask();
 	exit_to_kernel_mode(regs);
 }
@@ -402,7 +411,7 @@ static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr)
 {
 	enter_from_kernel_mode(regs);
 	local_daif_inherit(regs);
-	do_ptrauth_fault(regs, esr);
+	do_el1_fpac(regs, esr);
 	local_daif_mask();
 	exit_to_kernel_mode(regs);
 }
@@ -425,7 +434,10 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
 		break;
 	case ESR_ELx_EC_SYS64:
 	case ESR_ELx_EC_UNKNOWN:
-		el1_undef(regs);
+		el1_undef(regs, esr);
+		break;
+	case ESR_ELx_EC_BTI:
+		el1_bti(regs, esr);
 		break;
 	case ESR_ELx_EC_BREAKPT_CUR:
 	case ESR_ELx_EC_SOFTSTP_CUR:
@@ -582,11 +594,11 @@ static void noinstr el0_sp(struct pt_regs *regs, unsigned long esr)
 	exit_to_user_mode(regs);
 }
 
-static void noinstr el0_undef(struct pt_regs *regs)
+static void noinstr el0_undef(struct pt_regs *regs, unsigned long esr)
 {
 	enter_from_user_mode(regs);
 	local_daif_restore(DAIF_PROCCTX);
-	do_undefinstr(regs);
+	do_undefinstr(regs, esr);
 	exit_to_user_mode(regs);
 }
 
@@ -594,7 +606,7 @@ static void noinstr el0_bti(struct pt_regs *regs)
 {
 	enter_from_user_mode(regs);
 	local_daif_restore(DAIF_PROCCTX);
-	do_bti(regs);
+	do_el0_bti(regs);
 	exit_to_user_mode(regs);
 }
 
@@ -629,7 +641,7 @@ static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr)
 {
 	enter_from_user_mode(regs);
 	local_daif_restore(DAIF_PROCCTX);
-	do_ptrauth_fault(regs, esr);
+	do_el0_fpac(regs, esr);
 	exit_to_user_mode(regs);
 }
 
@@ -670,7 +682,7 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs)
 		el0_pc(regs, esr);
 		break;
 	case ESR_ELx_EC_UNKNOWN:
-		el0_undef(regs);
+		el0_undef(regs, esr);
 		break;
 	case ESR_ELx_EC_BTI:
 		el0_bti(regs);
@@ -788,7 +800,7 @@ asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs)
 	case ESR_ELx_EC_CP14_MR:
 	case ESR_ELx_EC_CP14_LS:
 	case ESR_ELx_EC_CP14_64:
-		el0_undef(regs);
+		el0_undef(regs, esr);
 		break;
 	case ESR_ELx_EC_CP15_32:
 	case ESR_ELx_EC_CP15_64:
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index ea5dc7c90f46..b49ba9a24bcc 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -217,11 +217,26 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
 	unsigned long pc = rec->ip;
 	u32 old = 0, new;
 
+	new = aarch64_insn_gen_nop();
+
+	/*
+	 * When using mcount, callsites in modules may have been initalized to
+	 * call an arbitrary module PLT (which redirects to the _mcount stub)
+	 * rather than the ftrace PLT we'll use at runtime (which redirects to
+	 * the ftrace trampoline). We can ignore the old PLT when initializing
+	 * the callsite.
+	 *
+	 * Note: 'mod' is only set at module load time.
+	 */
+	if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) &&
+	    IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && mod) {
+		return aarch64_insn_patch_text_nosync((void *)pc, new);
+	}
+
 	if (!ftrace_find_callable_addr(rec, mod, &addr))
 		return -EINVAL;
 
 	old = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
-	new = aarch64_insn_gen_nop();
 
 	return ftrace_modify_code(pc, old, new, true);
 }
diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c
index a3d0494f25a9..5a0a8f552a61 100644
--- a/arch/arm64/kernel/module-plts.c
+++ b/arch/arm64/kernel/module-plts.c
@@ -37,7 +37,8 @@ struct plt_entry get_plt_entry(u64 dst, void *pc)
 	return plt;
 }
 
-bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b)
+static bool plt_entries_equal(const struct plt_entry *a,
+			      const struct plt_entry *b)
 {
 	u64 p, q;
 
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index f2d4bb14bfab..76b41e4ca9fa 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -476,21 +476,6 @@ overflow:
 	return -ENOEXEC;
 }
 
-static const Elf_Shdr *find_section(const Elf_Ehdr *hdr,
-				    const Elf_Shdr *sechdrs,
-				    const char *name)
-{
-	const Elf_Shdr *s, *se;
-	const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
-
-	for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) {
-		if (strcmp(name, secstrs + s->sh_name) == 0)
-			return s;
-	}
-
-	return NULL;
-}
-
 static inline void __init_plt(struct plt_entry *plt, unsigned long addr)
 {
 	*plt = get_plt_entry(addr, plt);
diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c
index f6f58e6265df..b4eece3eb17d 100644
--- a/arch/arm64/kernel/perf_regs.c
+++ b/arch/arm64/kernel/perf_regs.c
@@ -9,9 +9,27 @@
 #include <asm/perf_regs.h>
 #include <asm/ptrace.h>
 
+static u64 perf_ext_regs_value(int idx)
+{
+	switch (idx) {
+	case PERF_REG_ARM64_VG:
+		if (WARN_ON_ONCE(!system_supports_sve()))
+			return 0;
+
+		/*
+		 * Vector granule is current length in bits of SVE registers
+		 * divided by 64.
+		 */
+		return (task_get_sve_vl(current) * 8) / 64;
+	default:
+		WARN_ON_ONCE(true);
+		return 0;
+	}
+}
+
 u64 perf_reg_value(struct pt_regs *regs, int idx)
 {
-	if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_MAX))
+	if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_EXTENDED_MAX))
 		return 0;
 
 	/*
@@ -51,6 +69,9 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
 	if ((u32)idx == PERF_REG_ARM64_PC)
 		return regs->pc;
 
+	if ((u32)idx >= PERF_REG_ARM64_MAX)
+		return perf_ext_regs_value(idx);
+
 	return regs->regs[idx];
 }
 
@@ -58,7 +79,12 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
 
 int perf_reg_validate(u64 mask)
 {
-	if (!mask || mask & REG_RESERVED)
+	u64 reserved_mask = REG_RESERVED;
+
+	if (system_supports_sve())
+		reserved_mask &= ~(1ULL << PERF_REG_ARM64_VG);
+
+	if (!mask || mask & reserved_mask)
 		return -EINVAL;
 
 	return 0;
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index eb7c08dfb834..13bfd84ce798 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -121,7 +121,7 @@ static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr)
 {
 	return ((addr & ~(THREAD_SIZE - 1))  ==
 		(kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))) ||
-		on_irq_stack(addr, sizeof(unsigned long), NULL);
+		on_irq_stack(addr, sizeof(unsigned long));
 }
 
 /**
@@ -666,10 +666,18 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,
 static int tls_get(struct task_struct *target, const struct user_regset *regset,
 		   struct membuf to)
 {
+	int ret;
+
 	if (target == current)
 		tls_preserve_current_state();
 
-	return membuf_store(&to, target->thread.uw.tp_value);
+	ret = membuf_store(&to, target->thread.uw.tp_value);
+	if (system_supports_tpidr2())
+		ret = membuf_store(&to, target->thread.tpidr2_el0);
+	else
+		ret = membuf_zero(&to, sizeof(u64));
+
+	return ret;
 }
 
 static int tls_set(struct task_struct *target, const struct user_regset *regset,
@@ -677,13 +685,20 @@ static int tls_set(struct task_struct *target, const struct user_regset *regset,
 		   const void *kbuf, const void __user *ubuf)
 {
 	int ret;
-	unsigned long tls = target->thread.uw.tp_value;
+	unsigned long tls[2];
 
-	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &tls, 0, -1);
+	tls[0] = target->thread.uw.tp_value;
+	if (system_supports_sme())
+		tls[1] = target->thread.tpidr2_el0;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, tls, 0, count);
 	if (ret)
 		return ret;
 
-	target->thread.uw.tp_value = tls;
+	target->thread.uw.tp_value = tls[0];
+	if (system_supports_sme())
+		target->thread.tpidr2_el0 = tls[1];
+
 	return ret;
 }
 
@@ -1392,7 +1407,7 @@ static const struct user_regset aarch64_regsets[] = {
 	},
 	[REGSET_TLS] = {
 		.core_note_type = NT_ARM_TLS,
-		.n = 1,
+		.n = 2,
 		.size = sizeof(void *),
 		.align = sizeof(void *),
 		.regset_get = tls_get,
diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c
index d20620a1c51a..d56e170e1ca7 100644
--- a/arch/arm64/kernel/sdei.c
+++ b/arch/arm64/kernel/sdei.c
@@ -162,38 +162,6 @@ static int init_sdei_scs(void)
 	return err;
 }
 
-static bool on_sdei_normal_stack(unsigned long sp, unsigned long size,
-				 struct stack_info *info)
-{
-	unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr);
-	unsigned long high = low + SDEI_STACK_SIZE;
-
-	return on_stack(sp, size, low, high, STACK_TYPE_SDEI_NORMAL, info);
-}
-
-static bool on_sdei_critical_stack(unsigned long sp, unsigned long size,
-				   struct stack_info *info)
-{
-	unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr);
-	unsigned long high = low + SDEI_STACK_SIZE;
-
-	return on_stack(sp, size, low, high, STACK_TYPE_SDEI_CRITICAL, info);
-}
-
-bool _on_sdei_stack(unsigned long sp, unsigned long size, struct stack_info *info)
-{
-	if (!IS_ENABLED(CONFIG_VMAP_STACK))
-		return false;
-
-	if (on_sdei_critical_stack(sp, size, info))
-		return true;
-
-	if (on_sdei_normal_stack(sp, size, info))
-		return true;
-
-	return false;
-}
-
 unsigned long sdei_arch_get_entry_point(int conduit)
 {
 	/*
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index ce190ee18a20..634279b3b03d 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -68,31 +68,6 @@ static inline void unwind_init_from_task(struct unwind_state *state,
 }
 
 /*
- * We can only safely access per-cpu stacks from current in a non-preemptible
- * context.
- */
-static bool on_accessible_stack(const struct task_struct *tsk,
-				unsigned long sp, unsigned long size,
-				struct stack_info *info)
-{
-	if (info)
-		info->type = STACK_TYPE_UNKNOWN;
-
-	if (on_task_stack(tsk, sp, size, info))
-		return true;
-	if (tsk != current || preemptible())
-		return false;
-	if (on_irq_stack(sp, size, info))
-		return true;
-	if (on_overflow_stack(sp, size, info))
-		return true;
-	if (on_sdei_stack(sp, size, info))
-		return true;
-
-	return false;
-}
-
-/*
  * Unwind from one frame record (A) to the next frame record (B).
  *
  * We terminate early if the location of B indicates a malformed chain of frame
@@ -103,14 +78,13 @@ static int notrace unwind_next(struct unwind_state *state)
 {
 	struct task_struct *tsk = state->task;
 	unsigned long fp = state->fp;
-	struct stack_info info;
 	int err;
 
 	/* Final frame; nothing to unwind */
 	if (fp == (unsigned long)task_pt_regs(tsk)->stackframe)
 		return -ENOENT;
 
-	err = unwind_next_common(state, &info, on_accessible_stack, NULL);
+	err = unwind_next_frame_record(state);
 	if (err)
 		return err;
 
@@ -190,11 +164,47 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl)
 	barrier();
 }
 
+/*
+ * Per-cpu stacks are only accessible when unwinding the current task in a
+ * non-preemptible context.
+ */
+#define STACKINFO_CPU(name)					\
+	({							\
+		((task == current) && !preemptible())		\
+			? stackinfo_get_##name()		\
+			: stackinfo_get_unknown();		\
+	})
+
+/*
+ * SDEI stacks are only accessible when unwinding the current task in an NMI
+ * context.
+ */
+#define STACKINFO_SDEI(name)					\
+	({							\
+		((task == current) && in_nmi())			\
+			? stackinfo_get_sdei_##name()		\
+			: stackinfo_get_unknown();		\
+	})
+
 noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry,
 			      void *cookie, struct task_struct *task,
 			      struct pt_regs *regs)
 {
-	struct unwind_state state;
+	struct stack_info stacks[] = {
+		stackinfo_get_task(task),
+		STACKINFO_CPU(irq),
+#if defined(CONFIG_VMAP_STACK)
+		STACKINFO_CPU(overflow),
+#endif
+#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_ARM_SDE_INTERFACE)
+		STACKINFO_SDEI(normal),
+		STACKINFO_SDEI(critical),
+#endif
+	};
+	struct unwind_state state = {
+		.stacks = stacks,
+		.nr_stacks = ARRAY_SIZE(stacks),
+	};
 
 	if (regs) {
 		if (task != current)
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index b7fed33981f7..54b5ba135b97 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -180,12 +180,12 @@ static void dump_kernel_instr(const char *lvl, struct pt_regs *regs)
 
 #define S_SMP " SMP"
 
-static int __die(const char *str, int err, struct pt_regs *regs)
+static int __die(const char *str, long err, struct pt_regs *regs)
 {
 	static int die_counter;
 	int ret;
 
-	pr_emerg("Internal error: %s: %x [#%d]" S_PREEMPT S_SMP "\n",
+	pr_emerg("Internal error: %s: %016lx [#%d]" S_PREEMPT S_SMP "\n",
 		 str, err, ++die_counter);
 
 	/* trap and error numbers are mostly meaningless on ARM */
@@ -206,7 +206,7 @@ static DEFINE_RAW_SPINLOCK(die_lock);
 /*
  * This function is protected against re-entrancy.
  */
-void die(const char *str, struct pt_regs *regs, int err)
+void die(const char *str, struct pt_regs *regs, long err)
 {
 	int ret;
 	unsigned long flags;
@@ -485,7 +485,7 @@ void arm64_notify_segfault(unsigned long addr)
 	force_signal_inject(SIGSEGV, code, addr, 0);
 }
 
-void do_undefinstr(struct pt_regs *regs)
+void do_undefinstr(struct pt_regs *regs, unsigned long esr)
 {
 	/* check for AArch32 breakpoint instructions */
 	if (!aarch32_break_handler(regs))
@@ -494,28 +494,38 @@ void do_undefinstr(struct pt_regs *regs)
 	if (call_undef_hook(regs) == 0)
 		return;
 
-	BUG_ON(!user_mode(regs));
+	if (!user_mode(regs))
+		die("Oops - Undefined instruction", regs, esr);
+
 	force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
 }
 NOKPROBE_SYMBOL(do_undefinstr);
 
-void do_bti(struct pt_regs *regs)
+void do_el0_bti(struct pt_regs *regs)
 {
-	BUG_ON(!user_mode(regs));
 	force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
 }
-NOKPROBE_SYMBOL(do_bti);
 
-void do_ptrauth_fault(struct pt_regs *regs, unsigned long esr)
+void do_el1_bti(struct pt_regs *regs, unsigned long esr)
+{
+	die("Oops - BTI", regs, esr);
+}
+NOKPROBE_SYMBOL(do_el1_bti);
+
+void do_el0_fpac(struct pt_regs *regs, unsigned long esr)
+{
+	force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr);
+}
+
+void do_el1_fpac(struct pt_regs *regs, unsigned long esr)
 {
 	/*
-	 * Unexpected FPAC exception or pointer authentication failure in
-	 * the kernel: kill the task before it does any more harm.
+	 * Unexpected FPAC exception in the kernel: kill the task before it
+	 * does any more harm.
 	 */
-	BUG_ON(!user_mode(regs));
-	force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr);
+	die("Oops - FPAC", regs, esr);
 }
-NOKPROBE_SYMBOL(do_ptrauth_fault);
+NOKPROBE_SYMBOL(do_el1_fpac)
 
 #define __user_cache_maint(insn, address, res)			\
 	if (address >= TASK_SIZE_MAX) {				\
@@ -758,7 +768,7 @@ void do_cp15instr(unsigned long esr, struct pt_regs *regs)
 		hook_base = cp15_64_hooks;
 		break;
 	default:
-		do_undefinstr(regs);
+		do_undefinstr(regs, esr);
 		return;
 	}
 
@@ -773,7 +783,7 @@ void do_cp15instr(unsigned long esr, struct pt_regs *regs)
 	 * EL0. Fall back to our usual undefined instruction handler
 	 * so that we handle these consistently.
 	 */
-	do_undefinstr(regs);
+	do_undefinstr(regs, esr);
 }
 NOKPROBE_SYMBOL(do_cp15instr);
 #endif
@@ -793,7 +803,7 @@ void do_sysinstr(unsigned long esr, struct pt_regs *regs)
 	 * back to our usual undefined instruction handler so that we handle
 	 * these consistently.
 	 */
-	do_undefinstr(regs);
+	do_undefinstr(regs, esr);
 }
 NOKPROBE_SYMBOL(do_sysinstr);
 
@@ -970,7 +980,7 @@ static int bug_handler(struct pt_regs *regs, unsigned long esr)
 {
 	switch (report_bug(regs->pc, regs)) {
 	case BUG_TRAP_TYPE_BUG:
-		die("Oops - BUG", regs, 0);
+		die("Oops - BUG", regs, esr);
 		break;
 
 	case BUG_TRAP_TYPE_WARN:
@@ -1038,7 +1048,7 @@ static int kasan_handler(struct pt_regs *regs, unsigned long esr)
 	 * This is something that might be fixed at some point in the future.
 	 */
 	if (!recover)
-		die("Oops - KASAN", regs, 0);
+		die("Oops - KASAN", regs, esr);
 
 	/* If thread survives, skip over the brk instruction and continue: */
 	arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index a61fc4f989b3..ac93a2ee9c07 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -29,9 +29,6 @@
 #include <asm/signal32.h>
 #include <asm/vdso.h>
 
-extern char vdso_start[], vdso_end[];
-extern char vdso32_start[], vdso32_end[];
-
 enum vdso_abi {
 	VDSO_ABI_AA64,
 	VDSO_ABI_AA32,
diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S
index e69fb4aaaf3e..6028f1fe2d1c 100644
--- a/arch/arm64/kernel/vdso/vdso.lds.S
+++ b/arch/arm64/kernel/vdso/vdso.lds.S
@@ -48,6 +48,13 @@ SECTIONS
 	PROVIDE (_etext = .);
 	PROVIDE (etext = .);
 
+	. = ALIGN(4);
+	.altinstructions : {
+		__alt_instructions = .;
+		*(.altinstructions)
+		__alt_instructions_end = .;
+	}
+
 	.dynamic	: { *(.dynamic) }		:text	:dynamic
 
 	.rela.dyn	: ALIGN(8) { *(.rela .rela*) }
diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
index 58f645ad66bc..ed6b58b19cfa 100644
--- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c
+++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
@@ -39,41 +39,32 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc)
 
 DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace);
 
-static bool on_overflow_stack(unsigned long sp, unsigned long size,
-			      struct stack_info *info)
+static struct stack_info stackinfo_get_overflow(void)
 {
 	unsigned long low = (unsigned long)this_cpu_ptr(overflow_stack);
 	unsigned long high = low + OVERFLOW_STACK_SIZE;
 
-	return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info);
+	return (struct stack_info) {
+		.low = low,
+		.high = high,
+	};
 }
 
-static bool on_hyp_stack(unsigned long sp, unsigned long size,
-			      struct stack_info *info)
+static struct stack_info stackinfo_get_hyp(void)
 {
 	struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
 	unsigned long high = params->stack_hyp_va;
 	unsigned long low = high - PAGE_SIZE;
 
-	return on_stack(sp, size, low, high, STACK_TYPE_HYP, info);
-}
-
-static bool on_accessible_stack(const struct task_struct *tsk,
-				unsigned long sp, unsigned long size,
-				struct stack_info *info)
-{
-	if (info)
-		info->type = STACK_TYPE_UNKNOWN;
-
-	return (on_overflow_stack(sp, size, info) ||
-		on_hyp_stack(sp, size, info));
+	return (struct stack_info) {
+		.low = low,
+		.high = high,
+	};
 }
 
 static int unwind_next(struct unwind_state *state)
 {
-	struct stack_info info;
-
-	return unwind_next_common(state, &info, on_accessible_stack, NULL);
+	return unwind_next_frame_record(state);
 }
 
 static void notrace unwind(struct unwind_state *state,
@@ -129,7 +120,14 @@ static bool pkvm_save_backtrace_entry(void *arg, unsigned long where)
  */
 static void pkvm_save_backtrace(unsigned long fp, unsigned long pc)
 {
-	struct unwind_state state;
+	struct stack_info stacks[] = {
+		stackinfo_get_overflow(),
+		stackinfo_get_hyp(),
+	};
+	struct unwind_state state = {
+		.stacks = stacks,
+		.nr_stacks = ARRAY_SIZE(stacks),
+	};
 	int idx = 0;
 
 	kvm_nvhe_unwind_init(&state, fp, pc);
diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c
index 949d19d603fb..3ace5b75813b 100644
--- a/arch/arm64/kvm/stacktrace.c
+++ b/arch/arm64/kvm/stacktrace.c
@@ -21,6 +21,54 @@
 
 #include <asm/stacktrace/nvhe.h>
 
+static struct stack_info stackinfo_get_overflow(void)
+{
+	struct kvm_nvhe_stacktrace_info *stacktrace_info
+				= this_cpu_ptr_nvhe_sym(kvm_stacktrace_info);
+	unsigned long low = (unsigned long)stacktrace_info->overflow_stack_base;
+	unsigned long high = low + OVERFLOW_STACK_SIZE;
+
+	return (struct stack_info) {
+		.low = low,
+		.high = high,
+	};
+}
+
+static struct stack_info stackinfo_get_overflow_kern_va(void)
+{
+	unsigned long low = (unsigned long)this_cpu_ptr_nvhe_sym(overflow_stack);
+	unsigned long high = low + OVERFLOW_STACK_SIZE;
+
+	return (struct stack_info) {
+		.low = low,
+		.high = high,
+	};
+}
+
+static struct stack_info stackinfo_get_hyp(void)
+{
+	struct kvm_nvhe_stacktrace_info *stacktrace_info
+				= this_cpu_ptr_nvhe_sym(kvm_stacktrace_info);
+	unsigned long low = (unsigned long)stacktrace_info->stack_base;
+	unsigned long high = low + PAGE_SIZE;
+
+	return (struct stack_info) {
+		.low = low,
+		.high = high,
+	};
+}
+
+static struct stack_info stackinfo_get_hyp_kern_va(void)
+{
+	unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page);
+	unsigned long high = low + PAGE_SIZE;
+
+	return (struct stack_info) {
+		.low = low,
+		.high = high,
+	};
+}
+
 /*
  * kvm_nvhe_stack_kern_va - Convert KVM nVHE HYP stack addresses to a kernel VAs
  *
@@ -34,73 +82,45 @@
  * Returns true on success and updates @addr to its corresponding kernel VA;
  * otherwise returns false.
  */
-static bool kvm_nvhe_stack_kern_va(unsigned long *addr,
-				   enum stack_type type)
+static bool kvm_nvhe_stack_kern_va(unsigned long *addr, unsigned long size)
 {
-	struct kvm_nvhe_stacktrace_info *stacktrace_info;
-	unsigned long hyp_base, kern_base, hyp_offset;
+	struct stack_info stack_hyp, stack_kern;
 
-	stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info);
+	stack_hyp = stackinfo_get_hyp();
+	stack_kern = stackinfo_get_hyp_kern_va();
+	if (stackinfo_on_stack(&stack_hyp, *addr, size))
+		goto found;
 
-	switch (type) {
-	case STACK_TYPE_HYP:
-		kern_base = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page);
-		hyp_base = (unsigned long)stacktrace_info->stack_base;
-		break;
-	case STACK_TYPE_OVERFLOW:
-		kern_base = (unsigned long)this_cpu_ptr_nvhe_sym(overflow_stack);
-		hyp_base = (unsigned long)stacktrace_info->overflow_stack_base;
-		break;
-	default:
-		return false;
-	}
+	stack_hyp = stackinfo_get_overflow();
+	stack_kern = stackinfo_get_overflow_kern_va();
+	if (stackinfo_on_stack(&stack_hyp, *addr, size))
+		goto found;
 
-	hyp_offset = *addr - hyp_base;
-
-	*addr = kern_base + hyp_offset;
+	return false;
 
+found:
+	*addr = *addr - stack_hyp.low + stack_kern.low;
 	return true;
 }
 
-static bool on_overflow_stack(unsigned long sp, unsigned long size,
-			      struct stack_info *info)
-{
-	struct kvm_nvhe_stacktrace_info *stacktrace_info
-				= this_cpu_ptr_nvhe_sym(kvm_stacktrace_info);
-	unsigned long low = (unsigned long)stacktrace_info->overflow_stack_base;
-	unsigned long high = low + OVERFLOW_STACK_SIZE;
-
-	return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info);
-}
-
-static bool on_hyp_stack(unsigned long sp, unsigned long size,
-			 struct stack_info *info)
-{
-	struct kvm_nvhe_stacktrace_info *stacktrace_info
-				= this_cpu_ptr_nvhe_sym(kvm_stacktrace_info);
-	unsigned long low = (unsigned long)stacktrace_info->stack_base;
-	unsigned long high = low + PAGE_SIZE;
-
-	return on_stack(sp, size, low, high, STACK_TYPE_HYP, info);
-}
-
-static bool on_accessible_stack(const struct task_struct *tsk,
-				unsigned long sp, unsigned long size,
-				struct stack_info *info)
+/*
+ * Convert a KVN nVHE HYP frame record address to a kernel VA
+ */
+static bool kvm_nvhe_stack_kern_record_va(unsigned long *addr)
 {
-	if (info)
-		info->type = STACK_TYPE_UNKNOWN;
-
-	return (on_overflow_stack(sp, size, info) ||
-		on_hyp_stack(sp, size, info));
+	return kvm_nvhe_stack_kern_va(addr, 16);
 }
 
 static int unwind_next(struct unwind_state *state)
 {
-	struct stack_info info;
-
-	return unwind_next_common(state, &info, on_accessible_stack,
-				  kvm_nvhe_stack_kern_va);
+	/*
+	 * The FP is in the hypervisor VA space. Convert it to the kernel VA
+	 * space so it can be unwound by the regular unwind functions.
+	 */
+	if (!kvm_nvhe_stack_kern_record_va(&state->fp))
+		return -EINVAL;
+
+	return unwind_next_frame_record(state);
 }
 
 static void unwind(struct unwind_state *state,
@@ -158,7 +178,14 @@ static void kvm_nvhe_dump_backtrace_end(void)
 static void hyp_dump_backtrace(unsigned long hyp_offset)
 {
 	struct kvm_nvhe_stacktrace_info *stacktrace_info;
-	struct unwind_state state;
+	struct stack_info stacks[] = {
+		stackinfo_get_overflow_kern_va(),
+		stackinfo_get_hyp_kern_va(),
+	};
+	struct unwind_state state = {
+		.stacks = stacks,
+		.nr_stacks = ARRAY_SIZE(stacks),
+	};
 
 	stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info);
 
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 63b2484ce6c3..f553a7cb1b07 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -68,6 +68,7 @@ WORKAROUND_2038923
 WORKAROUND_2064142
 WORKAROUND_2077057
 WORKAROUND_2457168
+WORKAROUND_2658417
 WORKAROUND_TRBE_OVERWRITE_FILL_MODE
 WORKAROUND_TSB_FLUSH_FAILURE
 WORKAROUND_TRBE_WRITE_OUT_OF_RANGE
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 1e2d69453771..44c07ea487f4 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -183,6 +183,13 @@ config APPLE_M1_CPU_PMU
 	  Provides support for the non-architectural CPU PMUs present on
 	  the Apple M1 SoCs and derivatives.
 
+config ALIBABA_UNCORE_DRW_PMU
+	tristate "Alibaba T-Head Yitian 710 DDR Sub-system Driveway PMU driver"
+	depends on ARM64 || COMPILE_TEST
+	help
+	  Support for Driveway PMU events monitoring on Yitian 710 DDR
+	  Sub-system.
+
 source "drivers/perf/hisilicon/Kconfig"
 
 config MARVELL_CN10K_DDR_PMU
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 57a279c61df5..050d04ee19dd 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
 obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
 obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
 obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o
+obj-$(CONFIG_ALIBABA_UNCORE_DRW_PMU) += alibaba_uncore_drw_pmu.o
diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c
new file mode 100644
index 000000000000..82729b874f09
--- /dev/null
+++ b/drivers/perf/alibaba_uncore_drw_pmu.c
@@ -0,0 +1,810 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Alibaba DDR Sub-System Driveway PMU driver
+ *
+ * Copyright (C) 2022 Alibaba Inc
+ */
+
+#define ALI_DRW_PMUNAME		"ali_drw"
+#define ALI_DRW_DRVNAME		ALI_DRW_PMUNAME "_pmu"
+#define pr_fmt(fmt)		ALI_DRW_DRVNAME ": " fmt
+
+#include <linux/acpi.h>
+#include <linux/bitfield.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/cpuhotplug.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/refcount.h>
+
+
+#define ALI_DRW_PMU_COMMON_MAX_COUNTERS			16
+#define ALI_DRW_PMU_TEST_SEL_COMMON_COUNTER_BASE	19
+
+#define ALI_DRW_PMU_PA_SHIFT			12
+#define ALI_DRW_PMU_CNT_INIT			0x00000000
+#define ALI_DRW_CNT_MAX_PERIOD			0xffffffff
+#define ALI_DRW_PMU_CYCLE_EVT_ID		0x80
+
+#define ALI_DRW_PMU_CNT_CTRL			0xC00
+#define ALI_DRW_PMU_CNT_RST			BIT(2)
+#define ALI_DRW_PMU_CNT_STOP			BIT(1)
+#define ALI_DRW_PMU_CNT_START			BIT(0)
+
+#define ALI_DRW_PMU_CNT_STATE			0xC04
+#define ALI_DRW_PMU_TEST_CTRL			0xC08
+#define ALI_DRW_PMU_CNT_PRELOAD			0xC0C
+
+#define ALI_DRW_PMU_CYCLE_CNT_HIGH_MASK		GENMASK(23, 0)
+#define ALI_DRW_PMU_CYCLE_CNT_LOW_MASK		GENMASK(31, 0)
+#define ALI_DRW_PMU_CYCLE_CNT_HIGH		0xC10
+#define ALI_DRW_PMU_CYCLE_CNT_LOW		0xC14
+
+/* PMU EVENT SEL 0-3 are paired in 32-bit registers on a 4-byte stride */
+#define ALI_DRW_PMU_EVENT_SEL0			0xC68
+/* counter 0-3 use sel0, counter 4-7 use sel1...*/
+#define ALI_DRW_PMU_EVENT_SELn(n) \
+	(ALI_DRW_PMU_EVENT_SEL0 + (n / 4) * 0x4)
+#define ALI_DRW_PMCOM_CNT_EN			BIT(7)
+#define ALI_DRW_PMCOM_CNT_EVENT_MASK		GENMASK(5, 0)
+#define ALI_DRW_PMCOM_CNT_EVENT_OFFSET(n) \
+	(8 * (n % 4))
+
+/* PMU COMMON COUNTER 0-15, are paired in 32-bit registers on a 4-byte stride */
+#define ALI_DRW_PMU_COMMON_COUNTER0		0xC78
+#define ALI_DRW_PMU_COMMON_COUNTERn(n) \
+	(ALI_DRW_PMU_COMMON_COUNTER0 + 0x4 * (n))
+
+#define ALI_DRW_PMU_OV_INTR_ENABLE_CTL		0xCB8
+#define ALI_DRW_PMU_OV_INTR_DISABLE_CTL		0xCBC
+#define ALI_DRW_PMU_OV_INTR_ENABLE_STATUS	0xCC0
+#define ALI_DRW_PMU_OV_INTR_CLR			0xCC4
+#define ALI_DRW_PMU_OV_INTR_STATUS		0xCC8
+#define ALI_DRW_PMCOM_CNT_OV_INTR_MASK		GENMASK(23, 8)
+#define ALI_DRW_PMBW_CNT_OV_INTR_MASK		GENMASK(7, 0)
+#define ALI_DRW_PMU_OV_INTR_MASK		GENMASK_ULL(63, 0)
+
+static int ali_drw_cpuhp_state_num;
+
+static LIST_HEAD(ali_drw_pmu_irqs);
+static DEFINE_MUTEX(ali_drw_pmu_irqs_lock);
+
+struct ali_drw_pmu_irq {
+	struct hlist_node node;
+	struct list_head irqs_node;
+	struct list_head pmus_node;
+	int irq_num;
+	int cpu;
+	refcount_t refcount;
+};
+
+struct ali_drw_pmu {
+	void __iomem *cfg_base;
+	struct device *dev;
+
+	struct list_head pmus_node;
+	struct ali_drw_pmu_irq *irq;
+	int irq_num;
+	int cpu;
+	DECLARE_BITMAP(used_mask, ALI_DRW_PMU_COMMON_MAX_COUNTERS);
+	struct perf_event *events[ALI_DRW_PMU_COMMON_MAX_COUNTERS];
+	int evtids[ALI_DRW_PMU_COMMON_MAX_COUNTERS];
+
+	struct pmu pmu;
+};
+
+#define to_ali_drw_pmu(p) (container_of(p, struct ali_drw_pmu, pmu))
+
+#define DRW_CONFIG_EVENTID		GENMASK(7, 0)
+#define GET_DRW_EVENTID(event)	FIELD_GET(DRW_CONFIG_EVENTID, (event)->attr.config)
+
+static ssize_t ali_drw_pmu_format_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct dev_ext_attribute *eattr;
+
+	eattr = container_of(attr, struct dev_ext_attribute, attr);
+
+	return sprintf(buf, "%s\n", (char *)eattr->var);
+}
+
+/*
+ * PMU event attributes
+ */
+static ssize_t ali_drw_pmu_event_show(struct device *dev,
+			       struct device_attribute *attr, char *page)
+{
+	struct dev_ext_attribute *eattr;
+
+	eattr = container_of(attr, struct dev_ext_attribute, attr);
+
+	return sprintf(page, "config=0x%lx\n", (unsigned long)eattr->var);
+}
+
+#define ALI_DRW_PMU_ATTR(_name, _func, _config)                            \
+		(&((struct dev_ext_attribute[]) {                               \
+				{ __ATTR(_name, 0444, _func, NULL), (void *)_config }   \
+		})[0].attr.attr)
+
+#define ALI_DRW_PMU_FORMAT_ATTR(_name, _config)            \
+	ALI_DRW_PMU_ATTR(_name, ali_drw_pmu_format_show, (void *)_config)
+#define ALI_DRW_PMU_EVENT_ATTR(_name, _config)             \
+	ALI_DRW_PMU_ATTR(_name, ali_drw_pmu_event_show, (unsigned long)_config)
+
+static struct attribute *ali_drw_pmu_events_attrs[] = {
+	ALI_DRW_PMU_EVENT_ATTR(hif_rd_or_wr,			0x0),
+	ALI_DRW_PMU_EVENT_ATTR(hif_wr,				0x1),
+	ALI_DRW_PMU_EVENT_ATTR(hif_rd,				0x2),
+	ALI_DRW_PMU_EVENT_ATTR(hif_rmw,				0x3),
+	ALI_DRW_PMU_EVENT_ATTR(hif_hi_pri_rd,			0x4),
+	ALI_DRW_PMU_EVENT_ATTR(dfi_wr_data_cycles,		0x7),
+	ALI_DRW_PMU_EVENT_ATTR(dfi_rd_data_cycles,		0x8),
+	ALI_DRW_PMU_EVENT_ATTR(hpr_xact_when_critical,		0x9),
+	ALI_DRW_PMU_EVENT_ATTR(lpr_xact_when_critical,		0xA),
+	ALI_DRW_PMU_EVENT_ATTR(wr_xact_when_critical,		0xB),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_activate,			0xC),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_rd_or_wr,			0xD),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_rd_activate,		0xE),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_rd,			0xF),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_wr,			0x10),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_mwr,			0x11),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_precharge,			0x12),
+	ALI_DRW_PMU_EVENT_ATTR(precharge_for_rdwr,		0x13),
+	ALI_DRW_PMU_EVENT_ATTR(precharge_for_other,		0x14),
+	ALI_DRW_PMU_EVENT_ATTR(rdwr_transitions,		0x15),
+	ALI_DRW_PMU_EVENT_ATTR(write_combine,			0x16),
+	ALI_DRW_PMU_EVENT_ATTR(war_hazard,			0x17),
+	ALI_DRW_PMU_EVENT_ATTR(raw_hazard,			0x18),
+	ALI_DRW_PMU_EVENT_ATTR(waw_hazard,			0x19),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_enter_selfref_rk0,		0x1A),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_enter_selfref_rk1,		0x1B),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_enter_selfref_rk2,		0x1C),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_enter_selfref_rk3,		0x1D),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_enter_powerdown_rk0,	0x1E),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_enter_powerdown_rk1,	0x1F),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_enter_powerdown_rk2,	0x20),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_enter_powerdown_rk3,	0x21),
+	ALI_DRW_PMU_EVENT_ATTR(selfref_mode_rk0,		0x26),
+	ALI_DRW_PMU_EVENT_ATTR(selfref_mode_rk1,		0x27),
+	ALI_DRW_PMU_EVENT_ATTR(selfref_mode_rk2,		0x28),
+	ALI_DRW_PMU_EVENT_ATTR(selfref_mode_rk3,		0x29),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_refresh,			0x2A),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_crit_ref,			0x2B),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_load_mode,			0x2D),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_zqcl,			0x2E),
+	ALI_DRW_PMU_EVENT_ATTR(visible_window_limit_reached_rd, 0x30),
+	ALI_DRW_PMU_EVENT_ATTR(visible_window_limit_reached_wr, 0x31),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_dqsosc_mpc,		0x34),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_dqsosc_mrr,		0x35),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_tcr_mrr,			0x36),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_zqstart,			0x37),
+	ALI_DRW_PMU_EVENT_ATTR(op_is_zqlatch,			0x38),
+	ALI_DRW_PMU_EVENT_ATTR(chi_txreq,			0x39),
+	ALI_DRW_PMU_EVENT_ATTR(chi_txdat,			0x3A),
+	ALI_DRW_PMU_EVENT_ATTR(chi_rxdat,			0x3B),
+	ALI_DRW_PMU_EVENT_ATTR(chi_rxrsp,			0x3C),
+	ALI_DRW_PMU_EVENT_ATTR(tsz_vio,				0x3D),
+	ALI_DRW_PMU_EVENT_ATTR(cycle,				0x80),
+	NULL,
+};
+
+static struct attribute_group ali_drw_pmu_events_attr_group = {
+	.name = "events",
+	.attrs = ali_drw_pmu_events_attrs,
+};
+
+static struct attribute *ali_drw_pmu_format_attr[] = {
+	ALI_DRW_PMU_FORMAT_ATTR(event, "config:0-7"),
+	NULL,
+};
+
+static const struct attribute_group ali_drw_pmu_format_group = {
+	.name = "format",
+	.attrs = ali_drw_pmu_format_attr,
+};
+
+static ssize_t ali_drw_pmu_cpumask_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(dev_get_drvdata(dev));
+
+	return cpumap_print_to_pagebuf(true, buf, cpumask_of(drw_pmu->cpu));
+}
+
+static struct device_attribute ali_drw_pmu_cpumask_attr =
+		__ATTR(cpumask, 0444, ali_drw_pmu_cpumask_show, NULL);
+
+static struct attribute *ali_drw_pmu_cpumask_attrs[] = {
+	&ali_drw_pmu_cpumask_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group ali_drw_pmu_cpumask_attr_group = {
+	.attrs = ali_drw_pmu_cpumask_attrs,
+};
+
+static const struct attribute_group *ali_drw_pmu_attr_groups[] = {
+	&ali_drw_pmu_events_attr_group,
+	&ali_drw_pmu_cpumask_attr_group,
+	&ali_drw_pmu_format_group,
+	NULL,
+};
+
+/* find a counter for event, then in add func, hw.idx will equal to counter */
+static int ali_drw_get_counter_idx(struct perf_event *event)
+{
+	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu);
+	int idx;
+
+	for (idx = 0; idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS; ++idx) {
+		if (!test_and_set_bit(idx, drw_pmu->used_mask))
+			return idx;
+	}
+
+	/* The counters are all in use. */
+	return -EBUSY;
+}
+
+static u64 ali_drw_pmu_read_counter(struct perf_event *event)
+{
+	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu);
+	u64 cycle_high, cycle_low;
+
+	if (GET_DRW_EVENTID(event) == ALI_DRW_PMU_CYCLE_EVT_ID) {
+		cycle_high = readl(drw_pmu->cfg_base + ALI_DRW_PMU_CYCLE_CNT_HIGH);
+		cycle_high &= ALI_DRW_PMU_CYCLE_CNT_HIGH_MASK;
+		cycle_low = readl(drw_pmu->cfg_base + ALI_DRW_PMU_CYCLE_CNT_LOW);
+		cycle_low &= ALI_DRW_PMU_CYCLE_CNT_LOW_MASK;
+		return (cycle_high << 32 | cycle_low);
+	}
+
+	return readl(drw_pmu->cfg_base +
+		     ALI_DRW_PMU_COMMON_COUNTERn(event->hw.idx));
+}
+
+static void ali_drw_pmu_event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 delta, prev, now;
+
+	do {
+		prev = local64_read(&hwc->prev_count);
+		now = ali_drw_pmu_read_counter(event);
+	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
+
+	/* handle overflow. */
+	delta = now - prev;
+	if (GET_DRW_EVENTID(event) == ALI_DRW_PMU_CYCLE_EVT_ID)
+		delta &= ALI_DRW_PMU_OV_INTR_MASK;
+	else
+		delta &= ALI_DRW_CNT_MAX_PERIOD;
+	local64_add(delta, &event->count);
+}
+
+static void ali_drw_pmu_event_set_period(struct perf_event *event)
+{
+	u64 pre_val;
+	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu);
+
+	/* set a preload counter for test purpose */
+	writel(ALI_DRW_PMU_TEST_SEL_COMMON_COUNTER_BASE + event->hw.idx,
+	       drw_pmu->cfg_base + ALI_DRW_PMU_TEST_CTRL);
+
+	/* set conunter initial value */
+	pre_val = ALI_DRW_PMU_CNT_INIT;
+	writel(pre_val, drw_pmu->cfg_base + ALI_DRW_PMU_CNT_PRELOAD);
+	local64_set(&event->hw.prev_count, pre_val);
+
+	/* set sel mode to zero to start test */
+	writel(0x0, drw_pmu->cfg_base + ALI_DRW_PMU_TEST_CTRL);
+}
+
+static void ali_drw_pmu_enable_counter(struct perf_event *event)
+{
+	u32 val, subval, reg, shift;
+	int counter = event->hw.idx;
+	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu);
+
+	reg = ALI_DRW_PMU_EVENT_SELn(counter);
+	val = readl(drw_pmu->cfg_base + reg);
+	subval = FIELD_PREP(ALI_DRW_PMCOM_CNT_EN, 1) |
+		 FIELD_PREP(ALI_DRW_PMCOM_CNT_EVENT_MASK, drw_pmu->evtids[counter]);
+
+	shift = ALI_DRW_PMCOM_CNT_EVENT_OFFSET(counter);
+	val &= ~(GENMASK(7, 0) << shift);
+	val |= subval << shift;
+
+	writel(val, drw_pmu->cfg_base + reg);
+}
+
+static void ali_drw_pmu_disable_counter(struct perf_event *event)
+{
+	u32 val, reg, subval, shift;
+	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu);
+	int counter = event->hw.idx;
+
+	reg = ALI_DRW_PMU_EVENT_SELn(counter);
+	val = readl(drw_pmu->cfg_base + reg);
+	subval = FIELD_PREP(ALI_DRW_PMCOM_CNT_EN, 0) |
+		 FIELD_PREP(ALI_DRW_PMCOM_CNT_EVENT_MASK, 0);
+
+	shift = ALI_DRW_PMCOM_CNT_EVENT_OFFSET(counter);
+	val &= ~(GENMASK(7, 0) << shift);
+	val |= subval << shift;
+
+	writel(val, drw_pmu->cfg_base + reg);
+}
+
+static irqreturn_t ali_drw_pmu_isr(int irq_num, void *data)
+{
+	struct ali_drw_pmu_irq *irq = data;
+	struct ali_drw_pmu *drw_pmu;
+	irqreturn_t ret = IRQ_NONE;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(drw_pmu, &irq->pmus_node, pmus_node) {
+		unsigned long status, clr_status;
+		struct perf_event *event;
+		unsigned int idx;
+
+		for (idx = 0; idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS; idx++) {
+			event = drw_pmu->events[idx];
+			if (!event)
+				continue;
+			ali_drw_pmu_disable_counter(event);
+		}
+
+		/* common counter intr status */
+		status = readl(drw_pmu->cfg_base + ALI_DRW_PMU_OV_INTR_STATUS);
+		status = FIELD_GET(ALI_DRW_PMCOM_CNT_OV_INTR_MASK, status);
+		if (status) {
+			for_each_set_bit(idx, &status,
+					 ALI_DRW_PMU_COMMON_MAX_COUNTERS) {
+				event = drw_pmu->events[idx];
+				if (WARN_ON_ONCE(!event))
+					continue;
+				ali_drw_pmu_event_update(event);
+				ali_drw_pmu_event_set_period(event);
+			}
+
+			/* clear common counter intr status */
+			clr_status = FIELD_PREP(ALI_DRW_PMCOM_CNT_OV_INTR_MASK, 1);
+			writel(clr_status,
+			       drw_pmu->cfg_base + ALI_DRW_PMU_OV_INTR_CLR);
+		}
+
+		for (idx = 0; idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS; idx++) {
+			event = drw_pmu->events[idx];
+			if (!event)
+				continue;
+			if (!(event->hw.state & PERF_HES_STOPPED))
+				ali_drw_pmu_enable_counter(event);
+		}
+		if (status)
+			ret = IRQ_HANDLED;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static struct ali_drw_pmu_irq *__ali_drw_pmu_init_irq(struct platform_device
+						      *pdev, int irq_num)
+{
+	int ret;
+	struct ali_drw_pmu_irq *irq;
+
+	list_for_each_entry(irq, &ali_drw_pmu_irqs, irqs_node) {
+		if (irq->irq_num == irq_num
+		    && refcount_inc_not_zero(&irq->refcount))
+			return irq;
+	}
+
+	irq = kzalloc(sizeof(*irq), GFP_KERNEL);
+	if (!irq)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&irq->pmus_node);
+
+	/* Pick one CPU to be the preferred one to use */
+	irq->cpu = smp_processor_id();
+	refcount_set(&irq->refcount, 1);
+
+	/*
+	 * FIXME: one of DDRSS Driveway PMU overflow interrupt shares the same
+	 * irq number with MPAM ERR_IRQ. To register DDRSS PMU and MPAM drivers
+	 * successfully, add IRQF_SHARED flag. Howerer, PMU interrupt should not
+	 * share with other component.
+	 */
+	ret = devm_request_irq(&pdev->dev, irq_num, ali_drw_pmu_isr,
+			       IRQF_SHARED, dev_name(&pdev->dev), irq);
+	if (ret < 0) {
+		dev_err(&pdev->dev,
+			"Fail to request IRQ:%d ret:%d\n", irq_num, ret);
+		goto out_free;
+	}
+
+	ret = irq_set_affinity_hint(irq_num, cpumask_of(irq->cpu));
+	if (ret)
+		goto out_free;
+
+	ret = cpuhp_state_add_instance_nocalls(ali_drw_cpuhp_state_num,
+					     &irq->node);
+	if (ret)
+		goto out_free;
+
+	irq->irq_num = irq_num;
+	list_add(&irq->irqs_node, &ali_drw_pmu_irqs);
+
+	return irq;
+
+out_free:
+	kfree(irq);
+	return ERR_PTR(ret);
+}
+
+static int ali_drw_pmu_init_irq(struct ali_drw_pmu *drw_pmu,
+				struct platform_device *pdev)
+{
+	int irq_num;
+	struct ali_drw_pmu_irq *irq;
+
+	/* Read and init IRQ */
+	irq_num = platform_get_irq(pdev, 0);
+	if (irq_num < 0)
+		return irq_num;
+
+	mutex_lock(&ali_drw_pmu_irqs_lock);
+	irq = __ali_drw_pmu_init_irq(pdev, irq_num);
+	mutex_unlock(&ali_drw_pmu_irqs_lock);
+
+	if (IS_ERR(irq))
+		return PTR_ERR(irq);
+
+	drw_pmu->irq = irq;
+
+	mutex_lock(&ali_drw_pmu_irqs_lock);
+	list_add_rcu(&drw_pmu->pmus_node, &irq->pmus_node);
+	mutex_unlock(&ali_drw_pmu_irqs_lock);
+
+	return 0;
+}
+
+static void ali_drw_pmu_uninit_irq(struct ali_drw_pmu *drw_pmu)
+{
+	struct ali_drw_pmu_irq *irq = drw_pmu->irq;
+
+	mutex_lock(&ali_drw_pmu_irqs_lock);
+	list_del_rcu(&drw_pmu->pmus_node);
+
+	if (!refcount_dec_and_test(&irq->refcount)) {
+		mutex_unlock(&ali_drw_pmu_irqs_lock);
+		return;
+	}
+
+	list_del(&irq->irqs_node);
+	mutex_unlock(&ali_drw_pmu_irqs_lock);
+
+	WARN_ON(irq_set_affinity_hint(irq->irq_num, NULL));
+	cpuhp_state_remove_instance_nocalls(ali_drw_cpuhp_state_num,
+					    &irq->node);
+	kfree(irq);
+}
+
+static int ali_drw_pmu_event_init(struct perf_event *event)
+{
+	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_event *sibling;
+	struct device *dev = drw_pmu->pmu.dev;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	if (is_sampling_event(event)) {
+		dev_err(dev, "Sampling not supported!\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (event->attach_state & PERF_ATTACH_TASK) {
+		dev_err(dev, "Per-task counter cannot allocate!\n");
+		return -EOPNOTSUPP;
+	}
+
+	event->cpu = drw_pmu->cpu;
+	if (event->cpu < 0) {
+		dev_err(dev, "Per-task mode not supported!\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (event->group_leader != event &&
+	    !is_software_event(event->group_leader)) {
+		dev_err(dev, "driveway only allow one event!\n");
+		return -EINVAL;
+	}
+
+	for_each_sibling_event(sibling, event->group_leader) {
+		if (sibling != event && !is_software_event(sibling)) {
+			dev_err(dev, "driveway event not allowed!\n");
+			return -EINVAL;
+		}
+	}
+
+	/* reset all the pmu counters */
+	writel(ALI_DRW_PMU_CNT_RST, drw_pmu->cfg_base + ALI_DRW_PMU_CNT_CTRL);
+
+	hwc->idx = -1;
+
+	return 0;
+}
+
+static void ali_drw_pmu_start(struct perf_event *event, int flags)
+{
+	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu);
+
+	event->hw.state = 0;
+
+	if (GET_DRW_EVENTID(event) == ALI_DRW_PMU_CYCLE_EVT_ID) {
+		writel(ALI_DRW_PMU_CNT_START,
+		       drw_pmu->cfg_base + ALI_DRW_PMU_CNT_CTRL);
+		return;
+	}
+
+	ali_drw_pmu_event_set_period(event);
+	if (flags & PERF_EF_RELOAD) {
+		unsigned long prev_raw_count =
+		    local64_read(&event->hw.prev_count);
+		writel(prev_raw_count,
+		       drw_pmu->cfg_base + ALI_DRW_PMU_CNT_PRELOAD);
+	}
+
+	ali_drw_pmu_enable_counter(event);
+
+	writel(ALI_DRW_PMU_CNT_START, drw_pmu->cfg_base + ALI_DRW_PMU_CNT_CTRL);
+}
+
+static void ali_drw_pmu_stop(struct perf_event *event, int flags)
+{
+	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu);
+
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
+	if (GET_DRW_EVENTID(event) != ALI_DRW_PMU_CYCLE_EVT_ID)
+		ali_drw_pmu_disable_counter(event);
+
+	writel(ALI_DRW_PMU_CNT_STOP, drw_pmu->cfg_base + ALI_DRW_PMU_CNT_CTRL);
+
+	ali_drw_pmu_event_update(event);
+	event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+}
+
+static int ali_drw_pmu_add(struct perf_event *event, int flags)
+{
+	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = -1;
+	int evtid;
+
+	evtid = GET_DRW_EVENTID(event);
+
+	if (evtid != ALI_DRW_PMU_CYCLE_EVT_ID) {
+		idx = ali_drw_get_counter_idx(event);
+		if (idx < 0)
+			return idx;
+		drw_pmu->events[idx] = event;
+		drw_pmu->evtids[idx] = evtid;
+	}
+	hwc->idx = idx;
+
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	if (flags & PERF_EF_START)
+		ali_drw_pmu_start(event, PERF_EF_RELOAD);
+
+	/* Propagate our changes to the userspace mapping. */
+	perf_event_update_userpage(event);
+
+	return 0;
+}
+
+static void ali_drw_pmu_del(struct perf_event *event, int flags)
+{
+	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	ali_drw_pmu_stop(event, PERF_EF_UPDATE);
+
+	if (idx >= 0 && idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS) {
+		drw_pmu->events[idx] = NULL;
+		drw_pmu->evtids[idx] = 0;
+		clear_bit(idx, drw_pmu->used_mask);
+	}
+
+	perf_event_update_userpage(event);
+}
+
+static void ali_drw_pmu_read(struct perf_event *event)
+{
+	ali_drw_pmu_event_update(event);
+}
+
+static int ali_drw_pmu_probe(struct platform_device *pdev)
+{
+	struct ali_drw_pmu *drw_pmu;
+	struct resource *res;
+	char *name;
+	int ret;
+
+	drw_pmu = devm_kzalloc(&pdev->dev, sizeof(*drw_pmu), GFP_KERNEL);
+	if (!drw_pmu)
+		return -ENOMEM;
+
+	drw_pmu->dev = &pdev->dev;
+	platform_set_drvdata(pdev, drw_pmu);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	drw_pmu->cfg_base = devm_ioremap_resource(&pdev->dev, res);
+	if (!drw_pmu->cfg_base)
+		return -ENOMEM;
+
+	name = devm_kasprintf(drw_pmu->dev, GFP_KERNEL, "ali_drw_%llx",
+			      (u64) (res->start >> ALI_DRW_PMU_PA_SHIFT));
+	if (!name)
+		return -ENOMEM;
+
+	writel(ALI_DRW_PMU_CNT_RST, drw_pmu->cfg_base + ALI_DRW_PMU_CNT_CTRL);
+
+	/* enable the generation of interrupt by all common counters */
+	writel(ALI_DRW_PMCOM_CNT_OV_INTR_MASK,
+	       drw_pmu->cfg_base + ALI_DRW_PMU_OV_INTR_ENABLE_CTL);
+
+	/* clearing interrupt status */
+	writel(0xffffff, drw_pmu->cfg_base + ALI_DRW_PMU_OV_INTR_CLR);
+
+	drw_pmu->cpu = smp_processor_id();
+
+	ret = ali_drw_pmu_init_irq(drw_pmu, pdev);
+	if (ret)
+		return ret;
+
+	drw_pmu->pmu = (struct pmu) {
+		.module		= THIS_MODULE,
+		.task_ctx_nr	= perf_invalid_context,
+		.event_init	= ali_drw_pmu_event_init,
+		.add		= ali_drw_pmu_add,
+		.del		= ali_drw_pmu_del,
+		.start		= ali_drw_pmu_start,
+		.stop		= ali_drw_pmu_stop,
+		.read		= ali_drw_pmu_read,
+		.attr_groups	= ali_drw_pmu_attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
+	};
+
+	ret = perf_pmu_register(&drw_pmu->pmu, name, -1);
+	if (ret) {
+		dev_err(drw_pmu->dev, "DRW Driveway PMU PMU register failed!\n");
+		ali_drw_pmu_uninit_irq(drw_pmu);
+	}
+
+	return ret;
+}
+
+static int ali_drw_pmu_remove(struct platform_device *pdev)
+{
+	struct ali_drw_pmu *drw_pmu = platform_get_drvdata(pdev);
+
+	/* disable the generation of interrupt by all common counters */
+	writel(ALI_DRW_PMCOM_CNT_OV_INTR_MASK,
+	       drw_pmu->cfg_base + ALI_DRW_PMU_OV_INTR_DISABLE_CTL);
+
+	ali_drw_pmu_uninit_irq(drw_pmu);
+	perf_pmu_unregister(&drw_pmu->pmu);
+
+	return 0;
+}
+
+static int ali_drw_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct ali_drw_pmu_irq *irq;
+	struct ali_drw_pmu *drw_pmu;
+	unsigned int target;
+	int ret;
+	cpumask_t node_online_cpus;
+
+	irq = hlist_entry_safe(node, struct ali_drw_pmu_irq, node);
+	if (cpu != irq->cpu)
+		return 0;
+
+	ret = cpumask_and(&node_online_cpus,
+			  cpumask_of_node(cpu_to_node(cpu)), cpu_online_mask);
+	if (ret)
+		target = cpumask_any_but(&node_online_cpus, cpu);
+	else
+		target = cpumask_any_but(cpu_online_mask, cpu);
+
+	if (target >= nr_cpu_ids)
+		return 0;
+
+	/* We're only reading, but this isn't the place to be involving RCU */
+	mutex_lock(&ali_drw_pmu_irqs_lock);
+	list_for_each_entry(drw_pmu, &irq->pmus_node, pmus_node)
+		perf_pmu_migrate_context(&drw_pmu->pmu, irq->cpu, target);
+	mutex_unlock(&ali_drw_pmu_irqs_lock);
+
+	WARN_ON(irq_set_affinity_hint(irq->irq_num, cpumask_of(target)));
+	irq->cpu = target;
+
+	return 0;
+}
+
+/*
+ * Due to historical reasons, the HID used in the production environment is
+ * ARMHD700, so we leave ARMHD700 as Compatible ID.
+ */
+static const struct acpi_device_id ali_drw_acpi_match[] = {
+	{"BABA5000", 0},
+	{"ARMHD700", 0},
+	{}
+};
+
+MODULE_DEVICE_TABLE(acpi, ali_drw_acpi_match);
+
+static struct platform_driver ali_drw_pmu_driver = {
+	.driver = {
+		   .name = "ali_drw_pmu",
+		   .acpi_match_table = ali_drw_acpi_match,
+		   },
+	.probe = ali_drw_pmu_probe,
+	.remove = ali_drw_pmu_remove,
+};
+
+static int __init ali_drw_pmu_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+				      "ali_drw_pmu:online",
+				      NULL, ali_drw_pmu_offline_cpu);
+
+	if (ret < 0) {
+		pr_err("DRW Driveway PMU: setup hotplug failed, ret = %d\n",
+		       ret);
+		return ret;
+	}
+	ali_drw_cpuhp_state_num = ret;
+
+	ret = platform_driver_register(&ali_drw_pmu_driver);
+	if (ret)
+		cpuhp_remove_multi_state(ali_drw_cpuhp_state_num);
+
+	return ret;
+}
+
+static void __exit ali_drw_pmu_exit(void)
+{
+	platform_driver_unregister(&ali_drw_pmu_driver);
+	cpuhp_remove_multi_state(ali_drw_cpuhp_state_num);
+}
+
+module_init(ali_drw_pmu_init);
+module_exit(ali_drw_pmu_exit);
+
+MODULE_AUTHOR("Hongbo Yao <yaohongbo@linux.alibaba.com>");
+MODULE_AUTHOR("Neng Chen <nengchen@linux.alibaba.com>");
+MODULE_AUTHOR("Shuai Xue <xueshuai@linux.alibaba.com>");
+MODULE_DESCRIPTION("Alibaba DDR Sub-System Driveway PMU driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 59d3980b8ca2..3f07df5a7e95 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -894,7 +894,7 @@ static struct arm_pmu *__armpmu_alloc(gfp_t flags)
 		 * pmu::filter_match callback and pmu::event_init group
 		 * validation).
 		 */
-		.capabilities	= PERF_PMU_CAP_HETEROGENEOUS_CPUS,
+		.capabilities	= PERF_PMU_CAP_HETEROGENEOUS_CPUS | PERF_PMU_CAP_EXTENDED_REGS,
 	};
 
 	pmu->attr_groups[ARMPMU_ATTR_GROUP_COMMON] =
diff --git a/tools/testing/selftests/arm64/abi/.gitignore b/tools/testing/selftests/arm64/abi/.gitignore
index b9e54417250d..12607c4580c6 100644
--- a/tools/testing/selftests/arm64/abi/.gitignore
+++ b/tools/testing/selftests/arm64/abi/.gitignore
@@ -1,2 +1,3 @@
+ptrace
 syscall-abi
 tpidr2
diff --git a/tools/testing/selftests/arm64/abi/Makefile b/tools/testing/selftests/arm64/abi/Makefile
index c8d7f2495eb2..445ac2dac4ee 100644
--- a/tools/testing/selftests/arm64/abi/Makefile
+++ b/tools/testing/selftests/arm64/abi/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 # Copyright (C) 2021 ARM Limited
 
-TEST_GEN_PROGS := syscall-abi tpidr2
+TEST_GEN_PROGS := ptrace syscall-abi tpidr2
 
 include ../../lib.mk
 
diff --git a/tools/testing/selftests/arm64/abi/ptrace.c b/tools/testing/selftests/arm64/abi/ptrace.c
new file mode 100644
index 000000000000..be952511af22
--- /dev/null
+++ b/tools/testing/selftests/arm64/abi/ptrace.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022 ARM Limited.
+ */
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/wait.h>
+#include <asm/sigcontext.h>
+#include <asm/ptrace.h>
+
+#include "../../kselftest.h"
+
+#define EXPECTED_TESTS 7
+
+#define MAX_TPIDRS 2
+
+static bool have_sme(void)
+{
+	return getauxval(AT_HWCAP2) & HWCAP2_SME;
+}
+
+static void test_tpidr(pid_t child)
+{
+	uint64_t read_val[MAX_TPIDRS];
+	uint64_t write_val[MAX_TPIDRS];
+	struct iovec read_iov, write_iov;
+	bool test_tpidr2 = false;
+	int ret, i;
+
+	read_iov.iov_base = read_val;
+	write_iov.iov_base = write_val;
+
+	/* Should be able to read a single TPIDR... */
+	read_iov.iov_len = sizeof(uint64_t);
+	ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_TLS, &read_iov);
+	ksft_test_result(ret == 0, "read_tpidr_one\n");
+
+	/* ...write a new value.. */
+	write_iov.iov_len = sizeof(uint64_t);
+	write_val[0] = read_val[0]++;
+	ret = ptrace(PTRACE_SETREGSET, child, NT_ARM_TLS, &write_iov);
+	ksft_test_result(ret == 0, "write_tpidr_one\n");
+
+	/* ...then read it back */
+	ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_TLS, &read_iov);
+	ksft_test_result(ret == 0 && write_val[0] == read_val[0],
+			 "verify_tpidr_one\n");
+
+	/* If we have TPIDR2 we should be able to read it */
+	read_iov.iov_len = sizeof(read_val);
+	ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_TLS, &read_iov);
+	if (ret == 0) {
+		/* If we have SME there should be two TPIDRs */
+		if (read_iov.iov_len >= sizeof(read_val))
+			test_tpidr2 = true;
+
+		if (have_sme() && test_tpidr2) {
+			ksft_test_result(test_tpidr2, "count_tpidrs\n");
+		} else {
+			ksft_test_result(read_iov.iov_len % sizeof(uint64_t) == 0,
+					 "count_tpidrs\n");
+		}
+	} else {
+		ksft_test_result_fail("count_tpidrs\n");
+	}
+
+	if (test_tpidr2) {
+		/* Try to write new values to all known TPIDRs... */
+		write_iov.iov_len = sizeof(write_val);
+		for (i = 0; i < MAX_TPIDRS; i++)
+			write_val[i] = read_val[i] + 1;
+		ret = ptrace(PTRACE_SETREGSET, child, NT_ARM_TLS, &write_iov);
+
+		ksft_test_result(ret == 0 &&
+				 write_iov.iov_len == sizeof(write_val),
+				 "tpidr2_write\n");
+
+		/* ...then read them back */
+		read_iov.iov_len = sizeof(read_val);
+		ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_TLS, &read_iov);
+
+		if (have_sme()) {
+			/* Should read back the written value */
+			ksft_test_result(ret == 0 &&
+					 read_iov.iov_len >= sizeof(read_val) &&
+					 memcmp(read_val, write_val,
+						sizeof(read_val)) == 0,
+					 "tpidr2_read\n");
+		} else {
+			/* TPIDR2 should read as zero */
+			ksft_test_result(ret == 0 &&
+					 read_iov.iov_len >= sizeof(read_val) &&
+					 read_val[0] == write_val[0] &&
+					 read_val[1] == 0,
+					 "tpidr2_read\n");
+		}
+
+		/* Writing only TPIDR... */
+		write_iov.iov_len = sizeof(uint64_t);
+		memcpy(write_val, read_val, sizeof(read_val));
+		write_val[0] += 1;
+		ret = ptrace(PTRACE_SETREGSET, child, NT_ARM_TLS, &write_iov);
+
+		if (ret == 0) {
+			/* ...should leave TPIDR2 untouched */
+			read_iov.iov_len = sizeof(read_val);
+			ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_TLS,
+				     &read_iov);
+
+			ksft_test_result(ret == 0 &&
+					 read_iov.iov_len >= sizeof(read_val) &&
+					 memcmp(read_val, write_val,
+						sizeof(read_val)) == 0,
+					 "write_tpidr_only\n");
+		} else {
+			ksft_test_result_fail("write_tpidr_only\n");
+		}
+	} else {
+		ksft_test_result_skip("tpidr2_write\n");
+		ksft_test_result_skip("tpidr2_read\n");
+		ksft_test_result_skip("write_tpidr_only\n");
+	}
+}
+
+static int do_child(void)
+{
+	if (ptrace(PTRACE_TRACEME, -1, NULL, NULL))
+		ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno));
+
+	if (raise(SIGSTOP))
+		ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno));
+
+	return EXIT_SUCCESS;
+}
+
+static int do_parent(pid_t child)
+{
+	int ret = EXIT_FAILURE;
+	pid_t pid;
+	int status;
+	siginfo_t si;
+
+	/* Attach to the child */
+	while (1) {
+		int sig;
+
+		pid = wait(&status);
+		if (pid == -1) {
+			perror("wait");
+			goto error;
+		}
+
+		/*
+		 * This should never happen but it's hard to flag in
+		 * the framework.
+		 */
+		if (pid != child)
+			continue;
+
+		if (WIFEXITED(status) || WIFSIGNALED(status))
+			ksft_exit_fail_msg("Child died unexpectedly\n");
+
+		if (!WIFSTOPPED(status))
+			goto error;
+
+		sig = WSTOPSIG(status);
+
+		if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si)) {
+			if (errno == ESRCH)
+				goto disappeared;
+
+			if (errno == EINVAL) {
+				sig = 0; /* bust group-stop */
+				goto cont;
+			}
+
+			ksft_test_result_fail("PTRACE_GETSIGINFO: %s\n",
+					      strerror(errno));
+			goto error;
+		}
+
+		if (sig == SIGSTOP && si.si_code == SI_TKILL &&
+		    si.si_pid == pid)
+			break;
+
+	cont:
+		if (ptrace(PTRACE_CONT, pid, NULL, sig)) {
+			if (errno == ESRCH)
+				goto disappeared;
+
+			ksft_test_result_fail("PTRACE_CONT: %s\n",
+					      strerror(errno));
+			goto error;
+		}
+	}
+
+	ksft_print_msg("Parent is %d, child is %d\n", getpid(), child);
+
+	test_tpidr(child);
+
+	ret = EXIT_SUCCESS;
+
+error:
+	kill(child, SIGKILL);
+
+disappeared:
+	return ret;
+}
+
+int main(void)
+{
+	int ret = EXIT_SUCCESS;
+	pid_t child;
+
+	srandom(getpid());
+
+	ksft_print_header();
+
+	ksft_set_plan(EXPECTED_TESTS);
+
+	child = fork();
+	if (!child)
+		return do_child();
+
+	if (do_parent(child))
+		ret = EXIT_FAILURE;
+
+	ksft_print_cnts();
+
+	return ret;
+}