From 2263a8d3eab0e1f07cb2e1ba556f4934acce5bd5 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 22 Oct 2024 19:08:16 -0400 Subject: [PATCH 01/34] selftests/mm temporary fix of hmm infinite loop jira SECO-170 In Rocky9 if you run ./run_vmtests.sh -t hmm it will fail and cause an infinite loop on ASSERTs in FIXTURE_TEARDOWN() This temporary fix is based on the discussion here https://patchwork.kernel.org/project/linux-kselftest/patch/26017fe3-5ad7-6946-57db-e5ec48063ceb@suse.cz/#25046055 We will investigate further kselftest updates that will resolve the root causes of this. Signed-off-by: Jonathan Maple --- tools/testing/selftests/mm/hmm-tests.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index d2cfc9b494a0e..6f75c54564176 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -159,6 +159,10 @@ FIXTURE_TEARDOWN(hmm) { int ret = close(self->fd); + if (ret != 0) { + fprintf(stderr, "close returned (%d) fd is (%d)\n", ret, self->fd); + exit(1); + } ASSERT_EQ(ret, 0); self->fd = -1; } From 7cf4867475f612477d048b7d8a327d3175d0aadf Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Thu, 29 Aug 2024 16:58:53 -0700 Subject: [PATCH 02/34] SUSE: patch: crypto-ecdh-implement-FIPS-PCT.patch Signed-off-by: Jeremy Allison --- crypto/ecdh.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/crypto/ecdh.c b/crypto/ecdh.c index fe8966511e9d7..af702cfefd22f 100644 --- a/crypto/ecdh.c +++ b/crypto/ecdh.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "ecc.h" struct ecdh_ctx { @@ -94,6 +95,36 @@ static int ecdh_compute_value(struct kpp_request *req) ctx->private_key, public_key); buf = public_key; nbytes = public_key_sz; + + /* + * SP800-56Arev3, 5.6.2.1.4: ("Owner Assurance of + * Pair-wise Consistency"): recompute the public key + * and check if the results match. + */ + if (fips_enabled) { + u64 *public_key_pct; + + if (ret < 0) + goto free_all; + + public_key_pct = kmalloc(public_key_sz, GFP_KERNEL); + if (!public_key_pct) { + ret = -ENOMEM; + goto free_all; + } + + ret = ecc_make_pub_key(ctx->curve_id, ctx->ndigits, + ctx->private_key, + public_key_pct); + if (ret < 0) { + kfree(public_key_pct); + goto free_all; + } + + if (memcmp(public_key, public_key_pct, public_key_sz)) + panic("ECDH PCT failed in FIPS mode"); + kfree(public_key_pct); + } } if (ret < 0) From 5dbf0cc7a72c073c91d105c9cf83380f1c93802b Mon Sep 17 00:00:00 2001 From: Jason Rodriguez Date: Mon, 30 Sep 2024 12:57:14 -0400 Subject: [PATCH 03/34] crypto: essiv - Zeroize keys on exit in essiv_aead_setkey() In essiv_aead_setkey(), use the same logic as crypto_authenc_esn_setkey() to zeroize keys on exit. 
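As a side note, the single-exit zeroization pattern referenced here can be sketched in
plain userspace C as follows (hypothetical names and sizes, not the actual essiv code --
the real change is in the diff below; the kernel uses memzero_explicit(), for which
explicit_bzero() is the closest glibc analogue, and neither can be optimized away the
way a trailing memset() can):

#include <stdio.h>
#include <string.h>

struct demo_keys {
	unsigned char enckey[32];
	unsigned char authkey[32];
};

/* Stand-in for the real setkey work; always succeeds here. */
static int program_cipher(const struct demo_keys *keys)
{
	(void)keys;
	return 0;
}

static int demo_setkey(const unsigned char *key, size_t keylen)
{
	struct demo_keys keys;
	int err = (keylen != sizeof(keys)) ? -1 : 0;

	if (err)
		goto out;	/* error paths also pass through the zeroization */

	memcpy(&keys, key, keylen);
	err = program_cipher(&keys);
out:
	explicit_bzero(&keys, sizeof(keys));	/* key material never lingers on the stack */
	return err;
}

int main(void)
{
	unsigned char key[64] = { 0 };

	printf("setkey: %d\n", demo_setkey(key, sizeof(key)));
	return 0;
}
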
[Sultan: touched up commit message] Signed-off-by: Jason Rodriguez --- crypto/essiv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/crypto/essiv.c b/crypto/essiv.c index 8bcc5bdcb2a95..ec81bdea25631 100644 --- a/crypto/essiv.c +++ b/crypto/essiv.c @@ -114,13 +114,16 @@ static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key, crypto_shash_update(desc, keys.enckey, keys.enckeylen) ?: crypto_shash_finup(desc, keys.authkey, keys.authkeylen, salt); if (err) - return err; + goto out; crypto_cipher_clear_flags(tctx->essiv_cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(tctx->essiv_cipher, crypto_aead_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); - return crypto_cipher_setkey(tctx->essiv_cipher, salt, - crypto_shash_digestsize(tctx->hash)); + err = crypto_cipher_setkey(tctx->essiv_cipher, salt, + crypto_shash_digestsize(tctx->hash)); +out: + memzero_explicit(&keys, sizeof(keys)); + return err; } static int essiv_aead_setauthsize(struct crypto_aead *tfm, From 80976ba4af5c783dec2817a7c74a743704cc2be5 Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Mon, 16 Jun 2025 13:34:27 -0700 Subject: [PATCH 04/34] crypto: jitter - replace LFSR with SHA3-256 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the kernel crypto API, the SHA3-256 algorithm is used as conditioning element to replace the LFSR in the Jitter RNG. All other parts of the Jitter RNG are unchanged. The application and use of the SHA-3 conditioning operation is identical to the user space Jitter RNG 3.4.0 by applying the following concept: - the Jitter RNG initializes a SHA-3 state which acts as the "entropy pool" when the Jitter RNG is allocated. - When a new time delta is obtained, it is inserted into the "entropy pool" with a SHA-3 update operation. Note, this operation in most of the cases is a simple memcpy() onto the SHA-3 stack. - To cause a true SHA-3 operation for each time delta operation, a second SHA-3 operation is performed hashing Jitter RNG status information. The final message digest is also inserted into the "entropy pool" with a SHA-3 update operation. Yet, this data is not considered to provide any entropy, but it shall stir the entropy pool. - To generate a random number, a SHA-3 final operation is performed to calculate a message digest followed by an immediate SHA-3 init to re-initialize the "entropy pool". The obtained message digest is one block of the Jitter RNG that is returned to the caller. Mathematically speaking, the random number generated by the Jitter RNG is: aux_t = SHA-3(Jitter RNG state data) Jitter RNG block = SHA-3(time_i || aux_i || time_(i-1) || aux_(i-1) || ... || time_(i-255) || aux_(i-255)) when assuming that the OSR = 1, i.e. the default value. This operation implies that the Jitter RNG has an output-blocksize of 256 bits instead of the 64 bits of the LFSR-based Jitter RNG that is replaced with this patch. The patch also replaces the varying number of invocations of the conditioning function with one fixed number of invocations. The use of the conditioning function consistent with the userspace Jitter RNG library version 3.4.0. The code is tested with a system that exhibited the least amount of entropy generated by the Jitter RNG: the SiFive Unmatched RISC-V system. The measured entropy rate is well above the heuristically implied entropy value of 1 bit of entropy per time delta. On all other tested systems, the measured entropy rate is even higher by orders of magnitude. 
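To make the conditioning flow above concrete, here is a small userspace illustration
using OpenSSL's SHA3-256 (an assumption purely for demonstration -- the kernel code
below uses crypto_shash with "sha3-256-generic", runs the auxiliary hash for a fixed
loop count, and feeds it the RNG health-test state rather than the delta itself):

/* cc jent_sketch.c -lcrypto */
#include <openssl/evp.h>
#include <stdint.h>
#include <stdio.h>

static EVP_MD_CTX *pool;	/* plays the role of the SHA-3 "entropy pool" */

/* Insert one time delta into the pool, as described above. */
static void insert_time(uint64_t delta, int stuck)
{
	unsigned char aux[32];
	unsigned int len;
	EVP_MD_CTX *tmp = EVP_MD_CTX_new();

	/* aux_t = SHA-3(status data): credited with zero entropy, it only
	 * stirs the pool and provides the Keccak operation that is timed. */
	EVP_DigestInit_ex(tmp, EVP_sha3_256(), NULL);
	EVP_DigestUpdate(tmp, &delta, sizeof(delta));	/* placeholder status data */
	EVP_DigestFinal_ex(tmp, aux, &len);
	EVP_MD_CTX_free(tmp);

	EVP_DigestUpdate(pool, aux, sizeof(aux));
	if (!stuck)					/* stuck deltas are not credited */
		EVP_DigestUpdate(pool, &delta, sizeof(delta));
}

/* Extract one 256-bit block, then re-initialize the pool and feed the
 * block back in, mirroring jent_read_random_block() in the patch. */
static void read_block(unsigned char out[32])
{
	unsigned int len;

	EVP_DigestFinal_ex(pool, out, &len);
	EVP_DigestInit_ex(pool, EVP_sha3_256(), NULL);
	EVP_DigestUpdate(pool, out, 32);
}

int main(void)
{
	unsigned char block[32];
	uint64_t i;

	pool = EVP_MD_CTX_new();
	EVP_DigestInit_ex(pool, EVP_sha3_256(), NULL);
	for (i = 0; i < 256; i++)
		insert_time(i * 2654435761u + 7, 0);	/* fake, noise-free deltas */
	read_block(block);
	for (i = 0; i < 32; i++)
		printf("%02x", (unsigned int)block[i]);
	printf("\n");
	return 0;
}
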
The measurement was performed using updated tooling provided with the user space Jitter RNG library test framework. The performance of the Jitter RNG with this patch is about en par with the performance of the Jitter RNG without the patch. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu Back-port of commit bb897c55042e9330bcf88b4b13cbdd6f9fabdd5e Author: Stephan Müller Date: Fri Apr 21 08:08:04 2023 +0200 Signed-off-by: Jeremy Allison --- crypto/Kconfig | 1 + crypto/jitterentropy-kcapi.c | 183 +++++++++++++++++++++++++++++++---- crypto/jitterentropy.c | 143 +++++++++------------------ crypto/jitterentropy.h | 10 +- 4 files changed, 218 insertions(+), 119 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index b600f4356df34..2385d0064017a 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -2025,6 +2025,7 @@ config CRYPTO_ANSI_CPRNG tristate "Pseudo Random Number Generation for Cryptographic modules" select CRYPTO_AES select CRYPTO_RNG + select CRYPTO_SHA3 help This option enables the generic pseudo random number generator for cryptographic modules. Uses the Algorithm specified in diff --git a/crypto/jitterentropy-kcapi.c b/crypto/jitterentropy-kcapi.c index b9edfaa51b273..4b50cbc8a2faf 100644 --- a/crypto/jitterentropy-kcapi.c +++ b/crypto/jitterentropy-kcapi.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Linux Kernel Crypto API specific code * - * Copyright Stephan Mueller , 2015 + * Copyright Stephan Mueller , 2015 - 2023 * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,6 +37,8 @@ * DAMAGE. */ +#include +#include #include #include #include @@ -46,6 +48,8 @@ #include "jitterentropy.h" +#define JENT_CONDITIONING_HASH "sha3-256-generic" + /*************************************************************************** * Helper function ***************************************************************************/ @@ -60,11 +64,6 @@ void jent_zfree(void *ptr) kfree_sensitive(ptr); } -void jent_memcpy(void *dest, const void *src, unsigned int n) -{ - memcpy(dest, src, n); -} - /* * Obtain a high-resolution time stamp value. The time stamp is used to measure * the execution time of a given code path and its variations. Hence, the time @@ -91,6 +90,91 @@ void jent_get_nstime(__u64 *out) *out = tmp; } +int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + SHASH_DESC_ON_STACK(desc, hash_state_desc->tfm); + u8 intermediary[SHA3_256_DIGEST_SIZE]; + __u64 j = 0; + int ret; + + desc->tfm = hash_state_desc->tfm; + + if (sizeof(intermediary) != crypto_shash_digestsize(desc->tfm)) { + pr_warn_ratelimited("Unexpected digest size\n"); + return -EINVAL; + } + + /* + * This loop fills a buffer which is injected into the entropy pool. + * The main reason for this loop is to execute something over which we + * can perform a timing measurement. The injection of the resulting + * data into the pool is performed to ensure the result is used and + * the compiler cannot optimize the loop away in case the result is not + * used at all. Yet that data is considered "additional information" + * considering the terminology from SP800-90A without any entropy. 
+ * + * Note, it does not matter which or how much data you inject, we are + * interested in one Keccack1600 compression operation performed with + * the crypto_shash_final. + */ + for (j = 0; j < hash_loop_cnt; j++) { + ret = crypto_shash_init(desc) ?: + crypto_shash_update(desc, intermediary, + sizeof(intermediary)) ?: + crypto_shash_finup(desc, addtl, addtl_len, intermediary); + if (ret) + goto err; + } + + /* + * Inject the data from the previous loop into the pool. This data is + * not considered to contain any entropy, but it stirs the pool a bit. + */ + ret = crypto_shash_update(desc, intermediary, sizeof(intermediary)); + if (ret) + goto err; + + /* + * Insert the time stamp into the hash context representing the pool. + * + * If the time stamp is stuck, do not finally insert the value into the + * entropy pool. Although this operation should not do any harm even + * when the time stamp has no entropy, SP800-90B requires that any + * conditioning operation to have an identical amount of input data + * according to section 3.1.5. + */ + if (!stuck) { + ret = crypto_shash_update(hash_state_desc, (u8 *)&time, + sizeof(__u64)); + } + +err: + shash_desc_zero(desc); + memzero_explicit(intermediary, sizeof(intermediary)); + + return ret; +} + +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len) +{ + struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state; + u8 jent_block[SHA3_256_DIGEST_SIZE]; + /* Obtain data from entropy pool and re-initialize it */ + int ret = crypto_shash_final(hash_state_desc, jent_block) ?: + crypto_shash_init(hash_state_desc) ?: + crypto_shash_update(hash_state_desc, jent_block, + sizeof(jent_block)); + + if (!ret && dst_len) + memcpy(dst, jent_block, dst_len); + + memzero_explicit(jent_block, sizeof(jent_block)); + return ret; +} + /*************************************************************************** * Kernel crypto API interface ***************************************************************************/ @@ -98,32 +182,82 @@ void jent_get_nstime(__u64 *out) struct jitterentropy { spinlock_t jent_lock; struct rand_data *entropy_collector; + struct crypto_shash *tfm; + struct shash_desc *sdesc; }; -static int jent_kcapi_init(struct crypto_tfm *tfm) +static void jent_kcapi_cleanup(struct crypto_tfm *tfm) { struct jitterentropy *rng = crypto_tfm_ctx(tfm); - int ret = 0; - rng->entropy_collector = jent_entropy_collector_alloc(1, 0); - if (!rng->entropy_collector) - ret = -ENOMEM; + spin_lock(&rng->jent_lock); - spin_lock_init(&rng->jent_lock); - return ret; -} + if (rng->sdesc) { + shash_desc_zero(rng->sdesc); + kfree(rng->sdesc); + } + rng->sdesc = NULL; -static void jent_kcapi_cleanup(struct crypto_tfm *tfm) -{ - struct jitterentropy *rng = crypto_tfm_ctx(tfm); + if (rng->tfm) + crypto_free_shash(rng->tfm); + rng->tfm = NULL; - spin_lock(&rng->jent_lock); if (rng->entropy_collector) jent_entropy_collector_free(rng->entropy_collector); rng->entropy_collector = NULL; spin_unlock(&rng->jent_lock); } +static int jent_kcapi_init(struct crypto_tfm *tfm) +{ + struct jitterentropy *rng = crypto_tfm_ctx(tfm); + struct crypto_shash *hash; + struct shash_desc *sdesc; + int size, ret = 0; + + spin_lock_init(&rng->jent_lock); + + /* + * Use SHA3-256 as conditioner. We allocate only the generic + * implementation as we are not interested in high-performance. The + * execution time of the SHA3 operation is measured and adds to the + * Jitter RNG's unpredictable behavior. 
If we have a slower hash + * implementation, the execution timing variations are larger. When + * using a fast implementation, we would need to call it more often + * as its variations are lower. + */ + hash = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(hash)) { + pr_err("Cannot allocate conditioning digest\n"); + return PTR_ERR(hash); + } + rng->tfm = hash; + + size = sizeof(struct shash_desc) + crypto_shash_descsize(hash); + sdesc = kmalloc(size, GFP_KERNEL); + if (!sdesc) { + ret = -ENOMEM; + goto err; + } + + sdesc->tfm = hash; + crypto_shash_init(sdesc); + rng->sdesc = sdesc; + + rng->entropy_collector = jent_entropy_collector_alloc(1, 0, sdesc); + if (!rng->entropy_collector) { + ret = -ENOMEM; + goto err; + } + + spin_lock_init(&rng->jent_lock); + return 0; + +err: + jent_kcapi_cleanup(tfm); + return ret; +} + static int jent_kcapi_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) @@ -180,15 +314,24 @@ static struct rng_alg jent_alg = { .cra_module = THIS_MODULE, .cra_init = jent_kcapi_init, .cra_exit = jent_kcapi_cleanup, - } }; static int __init jent_mod_init(void) { + SHASH_DESC_ON_STACK(desc, tfm); + struct crypto_shash *tfm; int ret = 0; - ret = jent_entropy_init(); + tfm = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + desc->tfm = tfm; + crypto_shash_init(desc); + ret = jent_entropy_init(desc); + shash_desc_zero(desc); + crypto_free_shash(tfm); if (ret) { /* Handle permanent health test error */ if (fips_enabled) diff --git a/crypto/jitterentropy.c b/crypto/jitterentropy.c index 227cedfa4f0ae..5b224d3d7442e 100644 --- a/crypto/jitterentropy.c +++ b/crypto/jitterentropy.c @@ -2,7 +2,7 @@ * Non-physical true random number generator based on timing jitter -- * Jitter RNG standalone code. * - * Copyright Stephan Mueller , 2015 - 2020 + * Copyright Stephan Mueller , 2015 - 2023 * * Design * ====== @@ -57,21 +57,22 @@ typedef unsigned long long __u64; typedef long long __s64; typedef unsigned int __u32; +typedef unsigned char u8; #define NULL ((void *) 0) /* The entropy pool */ struct rand_data { + /* SHA3-256 is used as conditioner */ +#define DATA_SIZE_BITS 256 /* all data values that are vital to maintain the security * of the RNG are marked as SENSITIVE. A user must not * access that information while the RNG executes its loops to * calculate the next random value. */ - __u64 data; /* SENSITIVE Actual random number */ - __u64 old_data; /* SENSITIVE Previous random number */ - __u64 prev_time; /* SENSITIVE Previous time stamp */ -#define DATA_SIZE_BITS ((sizeof(__u64)) * 8) - __u64 last_delta; /* SENSITIVE stuck test */ - __s64 last_delta2; /* SENSITIVE stuck test */ - unsigned int osr; /* Oversample rate */ + void *hash_state; /* SENSITIVE hash state entropy pool */ + __u64 prev_time; /* SENSITIVE Previous time stamp */ + __u64 last_delta; /* SENSITIVE stuck test */ + __s64 last_delta2; /* SENSITIVE stuck test */ + unsigned int osr; /* Oversample rate */ #define JENT_MEMORY_BLOCKS 64 #define JENT_MEMORY_BLOCKSIZE 32 #define JENT_MEMORY_ACCESSLOOPS 128 @@ -301,15 +302,13 @@ static int jent_permanent_health_failure(struct rand_data *ec) * an entropy collection. 
* * Input: - * @ec entropy collector struct -- may be NULL * @bits is the number of low bits of the timer to consider * @min is the number of bits we shift the timer value to the right at * the end to make sure we have a guaranteed minimum value * * @return Newly calculated loop counter */ -static __u64 jent_loop_shuffle(struct rand_data *ec, - unsigned int bits, unsigned int min) +static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min) { __u64 time = 0; __u64 shuffle = 0; @@ -317,12 +316,7 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, unsigned int mask = (1<data; + /* * We fold the time value as much as possible to ensure that as many * bits of the time stamp are included as possible. @@ -344,81 +338,32 @@ static __u64 jent_loop_shuffle(struct rand_data *ec, * execution time jitter * * This function injects the individual bits of the time value into the - * entropy pool using an LFSR. + * entropy pool using a hash. * - * The code is deliberately inefficient with respect to the bit shifting - * and shall stay that way. This function is the root cause why the code - * shall be compiled without optimization. This function not only acts as - * folding operation, but this function's execution is used to measure - * the CPU execution time jitter. Any change to the loop in this function - * implies that careful retesting must be done. - * - * @ec [in] entropy collector struct - * @time [in] time stamp to be injected - * @loop_cnt [in] if a value not equal to 0 is set, use the given value as - * number of loops to perform the folding - * @stuck [in] Is the time stamp identified as stuck? + * ec [in] entropy collector + * time [in] time stamp to be injected + * stuck [in] Is the time stamp identified as stuck? * * Output: - * updated ec->data - * - * @return Number of loops the folding operation is performed + * updated hash context in the entropy collector or error code */ -static void jent_lfsr_time(struct rand_data *ec, __u64 time, __u64 loop_cnt, - int stuck) +static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck) { - unsigned int i; - __u64 j = 0; - __u64 new = 0; -#define MAX_FOLD_LOOP_BIT 4 -#define MIN_FOLD_LOOP_BIT 0 - __u64 fold_loop_cnt = - jent_loop_shuffle(ec, MAX_FOLD_LOOP_BIT, MIN_FOLD_LOOP_BIT); - - /* - * testing purposes -- allow test app to set the counter, not - * needed during runtime - */ - if (loop_cnt) - fold_loop_cnt = loop_cnt; - for (j = 0; j < fold_loop_cnt; j++) { - new = ec->data; - for (i = 1; (DATA_SIZE_BITS) >= i; i++) { - __u64 tmp = time << (DATA_SIZE_BITS - i); - - tmp = tmp >> (DATA_SIZE_BITS - 1); - - /* - * Fibonacci LSFR with polynomial of - * x^64 + x^61 + x^56 + x^31 + x^28 + x^23 + 1 which is - * primitive according to - * http://poincare.matf.bg.ac.rs/~ezivkovm/publications/primpol1.pdf - * (the shift values are the polynomial values minus one - * due to counting bits from 0 to 63). As the current - * position is always the LSB, the polynomial only needs - * to shift data in from the left without wrap. - */ - tmp ^= ((new >> 63) & 1); - tmp ^= ((new >> 60) & 1); - tmp ^= ((new >> 55) & 1); - tmp ^= ((new >> 30) & 1); - tmp ^= ((new >> 27) & 1); - tmp ^= ((new >> 22) & 1); - new <<= 1; - new ^= tmp; - } - } - - /* - * If the time stamp is stuck, do not finally insert the value into - * the entropy pool. 
Although this operation should not do any harm - * even when the time stamp has no entropy, SP800-90B requires that - * any conditioning operation (SP800-90B considers the LFSR to be a - * conditioning operation) to have an identical amount of input - * data according to section 3.1.5. - */ - if (!stuck) - ec->data = new; +#define SHA3_HASH_LOOP (1<<3) + struct { + int rct_count; + unsigned int apt_observations; + unsigned int apt_count; + unsigned int apt_base; + } addtl = { + ec->rct_count, + ec->apt_observations, + ec->apt_count, + ec->apt_base + }; + + return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl), + SHA3_HASH_LOOP, stuck); } /* @@ -452,7 +397,7 @@ static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt) #define MAX_ACC_LOOP_BIT 7 #define MIN_ACC_LOOP_BIT 0 __u64 acc_loop_cnt = - jent_loop_shuffle(ec, MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); + jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); if (NULL == ec || NULL == ec->mem) return; @@ -520,14 +465,15 @@ static int jent_measure_jitter(struct rand_data *ec) stuck = jent_stuck(ec, current_delta); /* Now call the next noise sources which also injects the data */ - jent_lfsr_time(ec, current_delta, 0, stuck); + if (jent_condition_data(ec, current_delta, stuck)) + stuck = 1; return stuck; } /* * Generator of one 64 bit random number - * Function fills rand_data->data + * Function fills rand_data->hash_state * * @ec [in] Reference to entropy collector */ @@ -574,7 +520,7 @@ static void jent_gen_entropy(struct rand_data *ec) * @return 0 when request is fulfilled or an error * * The following error codes can occur: - * -1 entropy_collector is NULL + * -1 entropy_collector is NULL or the generation failed * -2 Intermittent health failure * -3 Permanent health failure */ @@ -604,7 +550,7 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, * Perform startup health tests and return permanent * error if it fails. */ - if (jent_entropy_init()) + if (jent_entropy_init(ec->hash_state)) return -3; return -2; @@ -614,7 +560,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, tocopy = (DATA_SIZE_BITS / 8); else tocopy = len; - jent_memcpy(p, &ec->data, tocopy); + if (jent_read_random_block(ec->hash_state, p, tocopy)) + return -1; len -= tocopy; p += tocopy; @@ -628,7 +575,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data, ***************************************************************************/ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags) + unsigned int flags, + void *hash_state) { struct rand_data *entropy_collector; @@ -655,6 +603,8 @@ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, osr = 1; /* minimum sampling rate is 1 */ entropy_collector->osr = osr; + entropy_collector->hash_state = hash_state; + /* fill the data pad with non-zero values */ jent_gen_entropy(entropy_collector); @@ -668,7 +618,7 @@ void jent_entropy_collector_free(struct rand_data *entropy_collector) jent_zfree(entropy_collector); } -int jent_entropy_init(void) +int jent_entropy_init(void *hash_state) { int i; __u64 delta_sum = 0; @@ -681,6 +631,7 @@ int jent_entropy_init(void) /* Required for RCT */ ec.osr = 1; + ec.hash_state = hash_state; /* We could perform statistical tests here, but the problem is * that we only have a few loop counts to do testing. 
These @@ -718,7 +669,7 @@ int jent_entropy_init(void) /* Invoke core entropy collection logic */ jent_get_nstime(&time); ec.prev_time = time; - jent_lfsr_time(&ec, time, 0, 0); + jent_condition_data(&ec, time, 0); jent_get_nstime(&time2); /* test whether timer works */ diff --git a/crypto/jitterentropy.h b/crypto/jitterentropy.h index 5cc583f6bc6b8..b3890ff26a023 100644 --- a/crypto/jitterentropy.h +++ b/crypto/jitterentropy.h @@ -2,14 +2,18 @@ extern void *jent_zalloc(unsigned int len); extern void jent_zfree(void *ptr); -extern void jent_memcpy(void *dest, const void *src, unsigned int n); extern void jent_get_nstime(__u64 *out); +extern int jent_hash_time(void *hash_state, __u64 time, u8 *addtl, + unsigned int addtl_len, __u64 hash_loop_cnt, + unsigned int stuck); +int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len); struct rand_data; -extern int jent_entropy_init(void); +extern int jent_entropy_init(void *hash_state); extern int jent_read_entropy(struct rand_data *ec, unsigned char *data, unsigned int len); extern struct rand_data *jent_entropy_collector_alloc(unsigned int osr, - unsigned int flags); + unsigned int flags, + void *hash_state); extern void jent_entropy_collector_free(struct rand_data *entropy_collector); From 9eead121a4c3a0a2ee6244269878667e5ca0d07d Mon Sep 17 00:00:00 2001 From: Jeremy Allison Date: Wed, 4 Sep 2024 10:24:07 -0700 Subject: [PATCH 05/34] crypto: aead,cipher - zeroize key buffer after use I.G 9.7.B for FIPS 140-3 specifies that variables temporarily holding cryptographic information should be zeroized once they are no longer needed. Accomplish this by using kfree_sensitive for buffers that previously held the private key. Signed-off-by: Hailey Mothershead Signed-off-by: Herbert Xu Back-ported from commit 23e4099bdc3c8381992f9eb975c79196d6755210 Author: Hailey Mothershead Date: Mon Apr 15 22:19:15 2024 +0000 Signed-off-by: Jeremy Allison --- crypto/aead.c | 3 +-- crypto/cipher.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/crypto/aead.c b/crypto/aead.c index 16991095270d2..c4ece86c45bc4 100644 --- a/crypto/aead.c +++ b/crypto/aead.c @@ -35,8 +35,7 @@ static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key, alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = crypto_aead_alg(tfm)->setkey(tfm, alignbuffer, keylen); - memset(alignbuffer, 0, keylen); - kfree(buffer); + kfree_sensitive(buffer); return ret; } diff --git a/crypto/cipher.c b/crypto/cipher.c index b47141ed4a9f3..395f0c2fbb9ff 100644 --- a/crypto/cipher.c +++ b/crypto/cipher.c @@ -34,8 +34,7 @@ static int setkey_unaligned(struct crypto_cipher *tfm, const u8 *key, alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cia->cia_setkey(crypto_cipher_tfm(tfm), alignbuffer, keylen); - memset(alignbuffer, 0, keylen); - kfree(buffer); + kfree_sensitive(buffer); return ret; } From fabee01343ab1a2111d79d4ed65df8809eeae9b8 Mon Sep 17 00:00:00 2001 From: Joachim Vandersmissen Date: Thu, 28 Mar 2024 11:24:30 -0500 Subject: [PATCH 06/34] crypto: ecdh - explicitly zeroize private_key private_key is overwritten with the key parameter passed in by the caller (if present), or alternatively a newly generated private key. However, it is possible that the caller provides a key (or the newly generated key) which is shorter than the previous key. In that scenario, some key material from the previous key would not be overwritten. 
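A toy userspace illustration of that residue (made-up sizes, nothing to do with the
real ecdh_ctx layout):

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char private_key[8];

	memset(private_key, 0xAA, sizeof(private_key));	/* previous, longer key */
	memcpy(private_key, "\x01\x02\x03\x04", 4);	/* new, shorter key */

	/* Prints "01 02 03 04 aa aa aa aa": half of the old key survives. */
	for (int i = 0; i < (int)sizeof(private_key); i++)
		printf("%02x ", private_key[i]);
	printf("\n");
	return 0;
}
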
The easiest solution is to explicitly zeroize the entire private_key array first. Note that this patch slightly changes the behavior of this function: previously, if the ecc_gen_privkey failed, the old private_key would remain. Now, the private_key is always zeroized. This behavior is consistent with the case where params.key is set and ecc_is_key_valid fails. Signed-off-by: Joachim Vandersmissen Signed-off-by: Herbert Xu Signed-off-by: Jonathan Maple --- crypto/ecdh.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crypto/ecdh.c b/crypto/ecdh.c index af702cfefd22f..85c64f1a40df2 100644 --- a/crypto/ecdh.c +++ b/crypto/ecdh.c @@ -34,6 +34,8 @@ static int ecdh_set_secret(struct crypto_kpp *tfm, const void *buf, params.key_size > sizeof(u64) * ctx->ndigits) return -EINVAL; + memset(ctx->private_key, 0, sizeof(ctx->private_key)); + if (!params.key || !params.key_size) return ecc_gen_privkey(ctx->curve_id, ctx->ndigits, ctx->private_key); From da17ae6cbb6f640b783b7714f12512c0359fa4da Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 14 Dec 2023 11:08:34 +0800 Subject: [PATCH 07/34] crypto: lib/mpi - Fix unexpected pointer access in mpi_ec_init [ Upstream commit ba3c5574203034781ac4231acf117da917efcd2a ] When the mpi_ec_ctx structure is initialized, some fields are not cleared, causing a crash when referencing the field when the structure was released. Initially, this issue was ignored because memory for mpi_ec_ctx is allocated with the __GFP_ZERO flag. For example, this error will be triggered when calculating the Za value for SM2 separately. Fixes: d58bb7e55a8a ("lib/mpi: Introduce ec implementation to MPI library") Cc: stable@vger.kernel.org # v6.5 Signed-off-by: Tianjia Zhang Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin Signed-off-by: Jonathan Maple --- lib/mpi/ec.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/mpi/ec.c b/lib/mpi/ec.c index 40f5908e57a4f..e16dca1e23d52 100644 --- a/lib/mpi/ec.c +++ b/lib/mpi/ec.c @@ -584,6 +584,9 @@ void mpi_ec_init(struct mpi_ec_ctx *ctx, enum gcry_mpi_ec_models model, ctx->a = mpi_copy(a); ctx->b = mpi_copy(b); + ctx->d = NULL; + ctx->t.two_inv_p = NULL; + ctx->t.p_barrett = use_barrett > 0 ? mpi_barrett_init(ctx->p, 0) : NULL; mpi_ec_get_reset(ctx); From bb3817740c0abfdd8c9b7e49bf43c96b0f29bb28 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Tue, 22 Jul 2025 15:47:52 -0700 Subject: [PATCH 08/34] crypto: Kconfig - Make CRYPTO_FIPS depend on the DRBG being built-in When FIPS mode is enabled (via fips=1), there is an absolute need for the DRBG to be available. This is at odds with the fact that the DRBG can be built as a module when in FIPS mode, leaving critical RNG functionality at the whims of userspace. Userspace could simply rmmod the DRBG module, or not provide it at all and thus a different stdrng algorithm could be used without anyone noticing. Additionally, when running a FIPS-enabled userspace, modprobe itself may perform a getrandom() syscall _before_ loading a given module. As a result, there's a possible deadlock scenario where the RNG core (crypto/rng.c) initializes _before_ the DRBG, thereby installing its getrandom() override without having an stdrng algorithm available. Then, when userspace calls getrandom() which redirects to the override in crypto/rng.c, crypto_alloc_rng("stdrng") invokes the UMH (modprobe) to load the DRBG (which is aliased to stdrng). And *then* that modprobe invocation gets stuck at getrandom() because there's no stdrng algorithm available! 
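For reference, the post-change configuration on a FIPS build ends up roughly as follows
(the CRYPTO_RNG2/CRYPTO_JITTERENTROPY select chain is quoted from memory and should be
double-checked against crypto/Kconfig):

  CONFIG_CRYPTO_FIPS=y
  CONFIG_CRYPTO_DRBG=y           # must now be built-in; =m no longer satisfies the dependency
  CONFIG_CRYPTO_RNG2=y           # RNG core, selected by the DRBG
  CONFIG_CRYPTO_JITTERENTROPY=y  # noise source, selected by the DRBG
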
There are too many risks that come with allowing the DRBG and RNG core to be modular for FIPS mode. Therefore, make CRYPTO_FIPS require the DRBG to be built-in, which in turn makes the DRBG require the RNG core to be built-in. That way, it's guaranteed for these drivers to be built-in when running in FIPS mode. Also clean up the CRYPTO_FIPS option name and remove the CRYPTO_ANSI_CPRNG dependency since it's obsolete for FIPS now. Signed-off-by: Sultan Alsawaf --- crypto/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index 2385d0064017a..a25806a8c9006 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -23,12 +23,12 @@ if CRYPTO comment "Crypto core or helper" config CRYPTO_FIPS - bool "FIPS 200 compliance" - depends on (CRYPTO_ANSI_CPRNG || CRYPTO_DRBG) && !CRYPTO_MANAGER_DISABLE_TESTS + bool "FIPS compliance" + depends on CRYPTO_DRBG=y && !CRYPTO_MANAGER_DISABLE_TESTS depends on (MODULE_SIG || !MODULES) help This option enables the fips boot option which is - required if you want the system to operate in a FIPS 200 + required if you want the system to operate in a FIPS certification. You should say no unless you know what this is. From 05da74b06bd6d7f40d086dada39adf2d033dc042 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Fri, 1 Aug 2025 15:19:15 -0700 Subject: [PATCH 09/34] random: Restrict extrng registration to init time It is technically a risk to permit extrng registration by modules after kernel init completes. Since there is only one user of the extrng interface and it is imperative that it is the _only_ registered extrng for FIPS compliance, restrict the extrng registration interface to only permit registration during kernel init and only from built-in drivers. This also eliminates the risks associated with the extrng interface itself being designed to solely accommodate a single registration, which would therefore permit the registered extrng to be overridden or even removed by an unrelated module. Signed-off-by: Sultan Alsawaf --- crypto/rng.c | 9 +----- drivers/char/random.c | 70 ++++++++---------------------------------- include/linux/random.h | 16 +++++----- 3 files changed, 20 insertions(+), 75 deletions(-) diff --git a/crypto/rng.c b/crypto/rng.c index c650678106a7f..a076f0878eb37 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -322,8 +322,7 @@ static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed) } static const struct random_extrng crypto_devrandom_rng = { - .extrng_read_iter = crypto_devrandom_read_iter, - .owner = THIS_MODULE, + .extrng_read_iter = crypto_devrandom_read_iter }; static int __init crypto_rng_init(void) @@ -333,13 +332,7 @@ static int __init crypto_rng_init(void) return 0; } -static void __exit crypto_rng_exit(void) -{ - random_unregister_extrng(); -} - late_initcall(crypto_rng_init); -module_exit(crypto_rng_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Random Number Generator"); diff --git a/drivers/char/random.c b/drivers/char/random.c index 317a0b15dc34c..5fe3118a3c278 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include @@ -314,7 +313,7 @@ static void crng_fast_key_erasure(u8 key[CHACHA_KEY_SIZE], /* * Hook for external RNG. 
*/ -static const struct random_extrng __rcu *extrng; +static const struct random_extrng *extrng __ro_after_init; /* * This function returns a ChaCha state that you may use for generating @@ -966,18 +965,12 @@ void __init add_bootloader_randomness(const void *buf, size_t len) credit_init_bits(len * 8); } -void random_register_extrng(const struct random_extrng *rng) +void __init random_register_extrng(const struct random_extrng *rng) { - rcu_assign_pointer(extrng, rng); + /* Don't allow the registered extrng to be overridden */ + BUG_ON(extrng); + extrng = rng; } -EXPORT_SYMBOL_GPL(random_register_extrng); - -void random_unregister_extrng(void) -{ - RCU_INIT_POINTER(extrng, NULL); - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(random_unregister_extrng); #if IS_ENABLED(CONFIG_VMGENID) static BLOCKING_NOTIFIER_HEAD(vmfork_chain); @@ -1386,7 +1379,6 @@ static void __cold try_to_generate_entropy(void) SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags) { - const struct random_extrng *rng; struct iov_iter iter; struct iovec iov; int ret; @@ -1404,19 +1396,11 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags if (len > INT_MAX) len = INT_MAX; - rcu_read_lock(); - rng = rcu_dereference(extrng); - if (rng && !try_module_get(rng->owner)) - rng = NULL; - rcu_read_unlock(); - - if (rng) { + if (extrng) { ret = import_single_range(READ, ubuf, len, &iov, &iter); if (unlikely(ret)) return ret; - ret = rng->extrng_read_iter(&iter, !!(flags & GRND_RANDOM)); - module_put(rng->owner); - return ret; + return extrng->extrng_read_iter(&iter, !!(flags & GRND_RANDOM)); } if (!crng_ready() && !(flags & GRND_INSECURE)) { @@ -1589,52 +1573,24 @@ static int random_fasync(int fd, struct file *filp, int on) static int random_open(struct inode *inode, struct file *filp) { - const struct random_extrng *rng; - - rcu_read_lock(); - rng = rcu_dereference(extrng); - if (rng && !try_module_get(rng->owner)) - rng = NULL; - rcu_read_unlock(); - - if (!rng) - return 0; - - filp->f_op = &extrng_random_fops; - filp->private_data = rng->owner; + if (extrng) + filp->f_op = &extrng_random_fops; return 0; } static int urandom_open(struct inode *inode, struct file *filp) { - const struct random_extrng *rng; + if (extrng) + filp->f_op = &extrng_urandom_fops; - rcu_read_lock(); - rng = rcu_dereference(extrng); - if (rng && !try_module_get(rng->owner)) - rng = NULL; - rcu_read_unlock(); - - if (!rng) - return 0; - - filp->f_op = &extrng_urandom_fops; - filp->private_data = rng->owner; - - return 0; -} - -static int extrng_release(struct inode *inode, struct file *filp) -{ - module_put(filp->private_data); return 0; } static ssize_t extrng_read_iter(struct kiocb *kiocb, struct iov_iter *iter) { - return rcu_dereference_raw(extrng)->extrng_read_iter(iter, false); + return extrng->extrng_read_iter(iter, false); } const struct file_operations random_fops = { @@ -1670,7 +1626,6 @@ static const struct file_operations extrng_random_fops = { .unlocked_ioctl = random_ioctl, .fasync = random_fasync, .llseek = noop_llseek, - .release = extrng_release, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, }; @@ -1682,7 +1637,6 @@ static const struct file_operations extrng_urandom_fops = { .unlocked_ioctl = random_ioctl, .fasync = random_fasync, .llseek = noop_llseek, - .release = extrng_release, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/include/linux/random.h b/include/linux/random.h index 
d4cabe51e9434..9bde794ec8d93 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -9,12 +9,6 @@ #include -struct iov_iter; -struct random_extrng { - ssize_t (*extrng_read_iter)(struct iov_iter *, bool reseed); - struct module *owner; -}; - struct notifier_block; void add_device_randomness(const void *buf, size_t len); @@ -42,9 +36,6 @@ static inline int register_random_vmfork_notifier(struct notifier_block *nb) { r static inline int unregister_random_vmfork_notifier(struct notifier_block *nb) { return 0; } #endif -void random_register_extrng(const struct random_extrng *rng); -void random_unregister_extrng(void); - void get_random_bytes(void *buf, size_t len); u8 get_random_u8(void); u16 get_random_u16(void); @@ -173,6 +164,13 @@ int random_online_cpu(unsigned int cpu); #ifndef MODULE extern const struct file_operations random_fops, urandom_fops; + +struct iov_iter; +struct random_extrng { + ssize_t (*extrng_read_iter)(struct iov_iter *iter, bool reseed); +}; + +void __init random_register_extrng(const struct random_extrng *rng); #endif #endif /* _LINUX_RANDOM_H */ From 53ac1812dc96aa0cd2a7132bb7839b71d54ffb3b Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Tue, 24 Jun 2025 15:16:34 -0700 Subject: [PATCH 10/34] crypto: rng - Convert crypto_default_rng_refcnt into an unsigned int There is no reason this refcount should be a signed int. Convert it to an unsigned int, thereby also making it less likely to ever overflow. Signed-off-by: Sultan Alsawaf --- crypto/rng.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crypto/rng.c b/crypto/rng.c index a076f0878eb37..108404df25be5 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -31,7 +31,7 @@ static struct crypto_rng *crypto_reseed_rng; static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); -static int crypto_default_rng_refcnt; +static unsigned int crypto_default_rng_refcnt; int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { @@ -164,7 +164,7 @@ void crypto_put_default_rng(void) EXPORT_SYMBOL_GPL(crypto_put_default_rng); #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE) -static int crypto_del_rng(struct crypto_rng **rngp, int *refcntp, +static int crypto_del_rng(struct crypto_rng **rngp, unsigned int *refcntp, struct mutex *lock) { int err = -EBUSY; From eb1998aede060dddfcc658130b39097a334054a6 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 11 Jun 2025 14:16:35 -0700 Subject: [PATCH 11/34] crypto: drbg - Align buffers to at least a cache line None of the ciphers used by the DRBG have an alignment requirement; thus, they all return 0 from .crypto_init, resulting in inconsistent alignment across all buffers. Align all buffers to at least a cache line to improve performance. This is especially useful when multiple DRBG instances are used, since it prevents false sharing of cache lines between the different instances. Signed-off-by: Sultan Alsawaf --- crypto/drbg.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crypto/drbg.c b/crypto/drbg.c index accf425de57f7..d14cc09b5d399 100644 --- a/crypto/drbg.c +++ b/crypto/drbg.c @@ -1283,6 +1283,12 @@ static inline int drbg_alloc_state(struct drbg_state *drbg) if (ret < 0) goto err; + /* + * Align to at least a cache line for better performance. This also + * prevents false sharing of cache lines between different instances. 
+ */ + ret = max(ret, L1_CACHE_BYTES - 1); + drbg->Vbuf = kmalloc(drbg_statelen(drbg) + ret, GFP_KERNEL); if (!drbg->Vbuf) { ret = -ENOMEM; From 492949b0a5c84b8335d141ad4fdb2e1861100952 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Tue, 24 Jun 2025 15:31:00 -0700 Subject: [PATCH 12/34] crypto: rng - Fix priority inversions due to mutex locks Since crypto_devrandom_read_iter() is invoked directly by user tasks and is accessible by every task in the system, there are glaring priority inversions on crypto_reseed_rng_lock and crypto_default_rng_lock. Tasks of arbitrary scheduling priority access crypto_devrandom_read_iter(). When a low-priority task owns one of the mutex locks, higher-priority tasks waiting on that mutex lock are stalled until the low-priority task is done. Fix the priority inversions by converting the mutex locks into rt_mutex locks which have PI support. Signed-off-by: Sultan Alsawaf --- crypto/rng.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/crypto/rng.c b/crypto/rng.c index 108404df25be5..ad32a3d62d673 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -14,8 +14,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -26,9 +26,9 @@ #include "internal.h" -static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_reseed_rng_lock); +static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_reseed_rng_lock); static struct crypto_rng *crypto_reseed_rng; -static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_default_rng_lock); +static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); static unsigned int crypto_default_rng_refcnt; @@ -145,11 +145,11 @@ int crypto_get_default_rng(void) { int err; - mutex_lock(&crypto_default_rng_lock); + rt_mutex_lock(&crypto_default_rng_lock); err = crypto_get_rng(&crypto_default_rng); if (!err) crypto_default_rng_refcnt++; - mutex_unlock(&crypto_default_rng_lock); + rt_mutex_unlock(&crypto_default_rng_lock); return err; } @@ -157,19 +157,19 @@ EXPORT_SYMBOL_GPL(crypto_get_default_rng); void crypto_put_default_rng(void) { - mutex_lock(&crypto_default_rng_lock); + rt_mutex_lock(&crypto_default_rng_lock); crypto_default_rng_refcnt--; - mutex_unlock(&crypto_default_rng_lock); + rt_mutex_unlock(&crypto_default_rng_lock); } EXPORT_SYMBOL_GPL(crypto_put_default_rng); #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE) static int crypto_del_rng(struct crypto_rng **rngp, unsigned int *refcntp, - struct mutex *lock) + struct rt_mutex *lock) { int err = -EBUSY; - mutex_lock(lock); + rt_mutex_lock(lock); if (refcntp && *refcntp) goto out; @@ -179,7 +179,7 @@ static int crypto_del_rng(struct crypto_rng **rngp, unsigned int *refcntp, err = 0; out: - mutex_unlock(lock); + rt_mutex_unlock(lock); return err; } @@ -264,7 +264,7 @@ static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed) * a separate mutex (drbg->drbg_mutex) around the * reseed-and-generate operation. 
*/ - mutex_lock(&crypto_reseed_rng_lock); + rt_mutex_lock(&crypto_reseed_rng_lock); /* If crypto_default_rng is not set, it will be seeded * at creation in __crypto_get_default_rng and thus no @@ -275,7 +275,7 @@ static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed) ret = crypto_get_rng(&crypto_reseed_rng); if (ret) { - mutex_unlock(&crypto_reseed_rng_lock); + rt_mutex_unlock(&crypto_reseed_rng_lock); return ret; } @@ -314,7 +314,7 @@ static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed) } if (reseed) - mutex_unlock(&crypto_reseed_rng_lock); + rt_mutex_unlock(&crypto_reseed_rng_lock); else crypto_put_default_rng(); memzero_explicit(tmp, sizeof(tmp)); From 2adf60d97e6a2d0b3215314736076964d12f435f Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 18 Jun 2025 23:42:08 -0700 Subject: [PATCH 13/34] mm/gup: reintroduce pin_user_pages_fast_only() Like pin_user_pages_fast(), but with the internal-only FOLL_FAST_ONLY flag. This complements the get_user_pages*() API, which already has get_user_pages_fast_only(). Note that pin_user_pages_fast_only() used to exist but was removed in upstream commit edad1bb1fbf7 ("mm/gup: remove pin_user_pages_fast_only()") due to it not having any users. Signed-off-by: Sultan Alsawaf --- include/linux/mm.h | 2 ++ mm/gup.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c8e3db24a537..68868785c9b7d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2577,6 +2577,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); diff --git a/mm/gup.c b/mm/gup.c index a935faad1a0e5..c4c40fc195c51 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3491,6 +3491,20 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, } EXPORT_SYMBOL_GPL(pin_user_pages_fast); +/* + * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior is + * the same, except that this one sets FOLL_PIN instead of FOLL_GET. + */ +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + if (!is_valid_gup_args(pages, NULL, &gup_flags, + FOLL_PIN | FOLL_FAST_ONLY)) + return -EINVAL; + return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); +} +EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); + /** * pin_user_pages_remote() - pin pages of a remote process * From 64f557526971ba59820a02bdd09729afa56e38c7 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Tue, 19 Aug 2025 11:30:03 -0700 Subject: [PATCH 14/34] crypto: rng - Implement fast per-CPU DRBG instances When the kernel is booted with fips=1, the RNG exposed to userspace is hijacked away from the CRNG and redirects to crypto_devrandom_read_iter(), which utilizes the DRBG. Notably, crypto_devrandom_read_iter() maintains just two global DRBG instances _for the entire system_, and the two instances serve separate request types: one instance for GRND_RANDOM requests (crypto_reseed_rng), and one instance for non-GRND_RANDOM requests (crypto_default_rng). 
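Condensed, the pre-patch read path has this shape (paraphrased from the removed lines
visible in the diff below, with reseed handling and rescheduling elided), and it is what
every CPU in the system funnels through:

static ssize_t old_devrandom_read_iter(struct iov_iter *iter)
{
	u8 tmp[256];			/* one small bounce buffer */
	ssize_t ret = 0;
	int err;

	err = crypto_get_default_rng();	/* refcounts the single global DRBG */
	if (err)
		return err;

	while (iov_iter_count(iter)) {
		size_t copied, i = min_t(size_t, iov_iter_count(iter), sizeof(tmp));

		/* Every caller serializes on the one DRBG instance here. */
		err = crypto_rng_get_bytes(crypto_default_rng, tmp, i);
		if (err)
			break;

		copied = copy_to_iter(tmp, i, iter);
		ret += copied;
		if (copied != i)
			break;
	}

	crypto_put_default_rng();
	memzero_explicit(tmp, sizeof(tmp));
	return ret ?: err;
}
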
So in essence, for requests of a single type, there is just one global RNG for all CPUs in the entire system, which scales _very_ poorly. To make matters worse, the temporary buffer used to ferry data between the DRBG and userspace is woefully small at only 256 bytes, which doesn't do a good job of maximizing throughput from the DRBG. This results in lost performance when userspace requests >256 bytes; it is observed that DRBG throughput improves by 70% on an i9-13900H when the buffer size is increased to 4096 bytes (one page). Going beyond the size of one page up to the DRBG maximum request limit of 65536 bytes produces diminishing returns of only 3% improved throughput in comparison. And going below the size of one page produces progressively less throughput at each power of 2: there's a 5% loss going from 4096 bytes to 2048 bytes and a 9% loss going from 2048 bytes to 1024 bytes. Thus, this implements per-CPU DRBG instances utilizing a page-sized buffer for each CPU to utilize the DRBG itself more effectively. On top of that, for non-GRND_RANDOM requests, the DRBG's operations now occur under a local lock that disables preemption on non-PREEMPT_RT kernels, which not only keeps each CPU's DRBG instance isolated from another, but also improves temporal cache locality while the DRBG actively generates a new string of random bytes. Prefaulting one user destination page at a time is also employed to prevent a DRBG instance from getting blocked on page faults, thereby maximizing the use of the DRBG so that the only bottleneck is the DRBG itself. Signed-off-by: Sultan Alsawaf --- crypto/rng.c | 452 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 373 insertions(+), 79 deletions(-) diff --git a/crypto/rng.c b/crypto/rng.c index ad32a3d62d673..5652a21cc7cd3 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -6,6 +6,9 @@ * * Copyright (c) 2008 Neil Horman * Copyright (c) 2015 Herbert Xu + * + * Copyright (C) 2025 Ctrl IQ, Inc. + * Author: Sultan Alsawaf */ #include @@ -17,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -26,13 +28,39 @@ #include "internal.h" -static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_reseed_rng_lock); -static struct crypto_rng *crypto_reseed_rng; static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); static unsigned int crypto_default_rng_refcnt; +/* + * Per-CPU RNG instances are only used by crypto_devrandom_rng. The global RNG, + * crypto_default_rng, is only used directly by other drivers. + * + * Per-CPU instances of the DRBG are efficient because the DRBG itself supports + * an arbitrary number of instances and can be seeded on a per-CPU basis. + * + * Specifically, the DRBG is seeded by the CRNG and the Jitter RNG. The CRNG is + * globally accessible and is already per-CPU. And while the Jitter RNG _isn't_ + * per-CPU, creating a DRBG instance also creates a Jitter RNG instance; + * therefore, per-CPU DRBG instances implies per-CPU Jitter RNG instances. 
+ */ +struct cpu_rng_inst { + local_lock_t lock; + struct rt_mutex mlock; + struct crypto_rng *rng; + void *page; +}; + +static DEFINE_PER_CPU_ALIGNED(struct cpu_rng_inst, pcpu_default_rng) = { + .lock = INIT_LOCAL_LOCK(pcpu_default_rng.lock), + .mlock = __RT_MUTEX_INITIALIZER(pcpu_default_rng.mlock) +}; +static DEFINE_PER_CPU_ALIGNED(struct cpu_rng_inst, pcpu_reseed_rng) = { + /* The reseed instances don't use the local lock */ + .mlock = __RT_MUTEX_INITIALIZER(pcpu_reseed_rng.mlock) +}; + int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { struct crypto_alg *alg = tfm->base.__crt_alg; @@ -164,32 +192,18 @@ void crypto_put_default_rng(void) EXPORT_SYMBOL_GPL(crypto_put_default_rng); #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE) -static int crypto_del_rng(struct crypto_rng **rngp, unsigned int *refcntp, - struct rt_mutex *lock) +int crypto_del_default_rng(void) { - int err = -EBUSY; - - rt_mutex_lock(lock); - if (refcntp && *refcntp) - goto out; - - crypto_free_rng(*rngp); - *rngp = NULL; - - err = 0; - -out: - rt_mutex_unlock(lock); + bool busy; - return err; -} + rt_mutex_lock(&crypto_default_rng_lock); + if (!(busy = crypto_default_rng_refcnt)) { + crypto_free_rng(crypto_default_rng); + crypto_default_rng = NULL; + } + rt_mutex_unlock(&crypto_default_rng_lock); -int crypto_del_default_rng(void) -{ - return crypto_del_rng(&crypto_default_rng, &crypto_default_rng_refcnt, - &crypto_default_rng_lock) ?: - crypto_del_rng(&crypto_reseed_rng, NULL, - &crypto_reseed_rng_lock); + return busy ? -EBUSY : 0; } EXPORT_SYMBOL_GPL(crypto_del_default_rng); #endif @@ -244,80 +258,338 @@ void crypto_unregister_rngs(struct rng_alg *algs, int count) } EXPORT_SYMBOL_GPL(crypto_unregister_rngs); -static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed) +/* + * On non-PREEMPT_RT kernels, local locks disable preemption. When there's no + * rng allocated, one must be allocated by calling crypto_get_rng(), which can + * sleep. Therefore, crypto_get_rng() cannot be called under local_lock(), so if + * our CPU's RNG instance doesn't have an rng allocated, we drop the local lock + * and take a mutex lock instead. After the local lock is dropped, the current + * task can be freely migrated to another CPU, which means that calling + * local_lock() again might not result in the same instance getting locked as + * before. That's why this function exists: to loop on calling local_lock() and + * allocating an rng as needed with crypto_get_rng() until the current CPU's + * instance is found to have an rng allocated. If crypto_get_rng() ever fails, + * this function returns an error even if there are instances for other CPUs + * which _do_ have an rng allocated. + */ +static __always_inline struct cpu_rng_inst * +lock_default_rng(struct crypto_rng **rng) __acquires(&cri->lock) { - struct crypto_rng *rng; - u8 tmp[256]; - ssize_t ret; + struct cpu_rng_inst __percpu *pcri = &pcpu_default_rng; + struct cpu_rng_inst *cri; + int ret; + + while (1) { + local_lock(&pcri->lock); + cri = this_cpu_ptr(pcri); + /* + * cri->rng can only transition from NULL to non-NULL. This may + * occur on a different CPU, thus cri->rng must be read + * atomically to prevent data races; this elides mlock by + * pairing with the WRITE_ONCE() in the slow path below. + * + * And if cri->rng is non-NULL, then it is good to go. To avoid + * data races due to load speculation on torn cri->rng loads + * _after_ the NULL check, one of the following is required: + * 1. 
smp_acquire__after_ctrl_dep() in the if-statement + * 2. All cri->rng reads are performed with READ_ONCE() + * 3. cri->rng is never read again outside this function + * + * Option #3 yields the best performance, so this function + * provides the rng pointer as an output for the caller to use. + */ + *rng = READ_ONCE(cri->rng); + if (likely(*rng)) + return cri; + + /* + * Slow path: there's no rng currently allocated to this instance. + * Release the local lock and acquire this instance's mlock to + * perform the allocation. + * + * Note that this task may be migrated to a different CPU now! + */ + local_unlock(&cri->lock); + rt_mutex_lock(&cri->mlock); + if (!cri->rng) { + struct crypto_rng *new_rng = NULL; + + ret = crypto_get_rng(&new_rng); + if (ret) { + rt_mutex_unlock(&cri->mlock); + break; + } - if (unlikely(!iov_iter_count(iter))) - return 0; + /* + * Pairs with READ_ONCE() above, because we might not be + * on the same CPU anymore as when we first got `cri`. + */ + WRITE_ONCE(cri->rng, new_rng); + } + rt_mutex_unlock(&cri->mlock); + } - if (reseed) { - u32 flags = 0; + /* + * Even if this task got migrated to another CPU that _does_ have an rng + * allocated, just bail out if crypto_get_rng() ever fails in order to + * avoid looping forever. + */ + return ERR_PTR(ret); +} - /* If reseeding is requested, acquire a lock on - * crypto_reseed_rng so it is not swapped out until - * the initial random bytes are generated. - * - * The algorithm implementation is also protected with - * a separate mutex (drbg->drbg_mutex) around the - * reseed-and-generate operation. +static __always_inline struct cpu_rng_inst * +lock_reseed_rng(struct crypto_rng **rng) __acquires(&cri->mlock) +{ + struct cpu_rng_inst __percpu *pcri = &pcpu_reseed_rng; + struct cpu_rng_inst *cri; + int ret; + + /* + * Use whichever CPU this task is currently running on, knowing full + * well that the task can freely migrate to other CPUs. The reseed RNG + * requires holding a lock across the entire devrandom read, so that + * another task cannot extract entropy from the same seed. In other + * words, when reseeding is requested, reseeding must be done every time + * every time mlock is acquired. + */ + cri = raw_cpu_ptr(pcri); + rt_mutex_lock(&cri->mlock); + if (likely(cri->rng)) { + /* + * Since this rng instance wasn't just allocated, it needs to be + * explicitly reseeded. New rng instances are seeded on creation + * in crypto_get_rng() and thus don't need explicit reseeding. */ - rt_mutex_lock(&crypto_reseed_rng_lock); + crypto_tfm_set_flags(crypto_rng_tfm(cri->rng), + CRYPTO_TFM_REQ_NEED_RESEED); + } else { + ret = crypto_get_rng(&cri->rng); + if (ret) { + rt_mutex_unlock(&cri->mlock); + return ERR_PTR(ret); + } + } - /* If crypto_default_rng is not set, it will be seeded - * at creation in __crypto_get_default_rng and thus no - * reseeding is needed. - */ - if (crypto_reseed_rng) - flags |= CRYPTO_TFM_REQ_NEED_RESEED; + *rng = cri->rng; + return cri; +} - ret = crypto_get_rng(&crypto_reseed_rng); - if (ret) { - rt_mutex_unlock(&crypto_reseed_rng_lock); - return ret; +#define lock_local_rng(rng, reseed) \ + ({ (reseed) ? lock_reseed_rng(rng) : lock_default_rng(rng); }) + +#define unlock_local_rng(cri, reseed) \ +do { \ + if (reseed) \ + rt_mutex_unlock(&(cri)->mlock); \ + else \ + local_unlock(&(cri)->lock); \ +} while (0) + +static __always_inline void +clear_rng_page(struct cpu_rng_inst *cri, size_t count) +{ + /* For zeroing a whole page, clear_page() is faster than memset() */ + count < PAGE_SIZE ? 
memset(cri->page, 0, count) : clear_page(cri->page); +} + +static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed) +{ + /* lock_local_rng() puts us in atomic context for !reseed on non-RT */ + const bool atomic = !reseed && !IS_ENABLED(CONFIG_PREEMPT_RT); + const bool user_no_reseed = !reseed && user_backed_iter(iter); + size_t ulen, page_dirty_len = 0; + struct cpu_rng_inst *cri; + struct crypto_rng *rng; + void __user *uaddr; + struct page *upage; + ssize_t ret = 0; + + if (unlikely(!iov_iter_count(iter))) + return 0; + + /* Set up the starting user destination address and length */ + if (user_no_reseed) { + if (iter_is_ubuf(iter)) { + uaddr = iter->ubuf + iter->iov_offset; + ulen = iov_iter_count(iter); + } else if (iter_is_iovec(iter)) { + uaddr = iter_iov_addr(iter); + ulen = iter_iov_len(iter); + } else { + /* + * ITER_UBUF and ITER_IOVEC are the only user-backed + * iters. Bug out if a new user-backed iter appears. + */ + BUG(); } + } - rng = crypto_reseed_rng; - crypto_tfm_set_flags(crypto_rng_tfm(rng), flags); - } else { - ret = crypto_get_default_rng(); - if (ret) - return ret; - rng = crypto_default_rng; +restart: + /* + * Pin the user page backing the current user destination address, + * potentially prefaulting to allocate a page for the destination. By + * prefaulting without the RNG lock held, the DRBG won't be blocked by + * time spent on page faults for this task, and thus the DRBG can still + * be used by other tasks. + */ + if (user_no_reseed && pin_user_pages_fast((unsigned long)uaddr, 1, + FOLL_WRITE, &upage) != 1) + goto exit; + + cri = lock_local_rng(&rng, reseed); + if (IS_ERR(cri)) { + if (!ret) + ret = PTR_ERR(cri); + goto unpin_upage; } - for (;;) { - size_t i, copied; + while (1) { + size_t copied, i = min(iov_iter_count(iter), PAGE_SIZE); + bool resched_without_lock = false; int err; - i = min_t(size_t, iov_iter_count(iter), sizeof(tmp)); - err = crypto_rng_get_bytes(rng, tmp, i); + /* + * Generate up to one page at a time, and align to a page + * boundary so we only need to pin one user page at a time. + */ + if (user_no_reseed) + i = min3(i, PAGE_SIZE - offset_in_page(uaddr), ulen); + + /* + * On non-PREEMPT_RT kernels, local locks disable preemption. + * The DRBG's generate() function has a mutex lock, which could + * mean that we'll schedule while atomic if the mutex lock + * sleeps. However, that will never happen if we ensure that + * there's never any contention on the DRBG's mutex lock while + * we're atomic! Our local lock ensures calls to the DRBG are + * always serialized, so there's no contention from here. And + * the DRBG only uses its mutex lock from one other path, when + * an instance of the DRBG is freshly allocated, which we only + * do from crypto_get_rng(). So the DRBG's mutex lock is + * guaranteed to not have contention when we call generate() and + * thus it'll never sleep here. And of course, nothing else in + * generate() ever sleeps. + */ + err = crypto_rng_get_bytes(rng, cri->page, i); if (err) { - ret = ret ?: err; + if (!ret) + ret = err; break; } - copied = copy_to_iter(tmp, i, iter); - ret += copied; + /* + * Record the number of bytes used in cri->page and either copy + * directly to the user address without faulting, or copy to the + * iter which is always backed by kernel memory when !reseed && + * !user_backed_iter(). 
When reseed == true, the iter may be + * backed by user memory, but we copy to it with the possibility + * of page faults anyway because we need to hold the lock across + * the entire call; this is why a mutex is used instead of a + * local lock for the reseed RNG, to permit sleeping without + * yielding the DRBG instance. + */ + page_dirty_len = max(i, page_dirty_len); + if (user_no_reseed) { + err = copy_to_user_nofault(uaddr, cri->page, i); + if (err >= 0) { + iov_iter_advance(iter, i - err); + ret += i - err; + } + if (err) + break; + } else { + /* + * We know that copying from cri->page is safe, so use + * _copy_to_iter() directly to skip check_copy_size(). + */ + copied = _copy_to_iter(cri->page, i, iter); + ret += copied; + if (copied != i) + break; + } - if (!iov_iter_count(iter) || copied != i) + /* + * Quit when either the requested number of bytes have been + * generated or there is a pending signal. + */ + if (!iov_iter_count(iter) || signal_pending(current)) break; - BUILD_BUG_ON(PAGE_SIZE % sizeof(tmp) != 0); - if (ret % PAGE_SIZE == 0) { - if (signal_pending(current)) - break; - cond_resched(); + /* Compute the next user destination address and length */ + if (user_no_reseed) { + ulen -= i; + if (likely(ulen)) { + uaddr += i; + } else { + /* + * This path is only reachable by ITER_IOVEC + * because ulen is initialized to the request + * size for ITER_UBUF, and therefore ITER_UBUF + * will always quit at the iov_iter_count() + * check above before ulen can become zero. + * + * iter->iov_offset is guaranteed to be zero + * here, so iter_iov_{addr|len}() isn't needed. + */ + uaddr = iter_iov(iter)->iov_base; + ulen = iter_iov(iter)->iov_len; + } + + unpin_user_page(upage); + } + + /* + * Reschedule right now if needed and we're not atomic. If we're + * atomic, then we must first drop the lock to reschedule. + */ + if (need_resched()) { + if (atomic) + resched_without_lock = true; + else + cond_resched(); + } + + /* + * Optimistically try to pin the next user page without + * faulting, so we don't need to clear cri->page and drop the + * lock on every iteration. If this fails, we fall back to + * pinning with the option to prefault. + */ + if (user_no_reseed && !resched_without_lock && + pin_user_pages_fast_only((unsigned long)uaddr, 1, + FOLL_WRITE, &upage) == 1) + continue; + + /* + * Restart if either rescheduling is needed (and requires + * dropping the lock since we're atomic) or the optimistic page + * pinning attempt failed. + * + * This always implies `reseed == false`, so unlock_local_rng() + * can just be passed `false` for reseed to eliminate a branch. + */ + if (resched_without_lock || user_no_reseed) { + /* + * Clear the buffer of our latest random bytes before + * unlocking and potentially migrating CPUs, in which + * case we wouldn't have the same `cri` anymore. + */ + clear_rng_page(cri, page_dirty_len); + unlock_local_rng(cri, false); + page_dirty_len = 0; + if (resched_without_lock) + cond_resched(); + goto restart; } } - if (reseed) - rt_mutex_unlock(&crypto_reseed_rng_lock); - else - crypto_put_default_rng(); - memzero_explicit(tmp, sizeof(tmp)); + if (page_dirty_len) + clear_rng_page(cri, page_dirty_len); + unlock_local_rng(cri, reseed); +unpin_upage: + if (user_no_reseed) + unpin_user_page(upage); +exit: return ret ? 
ret : -EFAULT; } @@ -325,10 +597,32 @@ static const struct random_extrng crypto_devrandom_rng = { .extrng_read_iter = crypto_devrandom_read_iter }; +static void __init alloc_pcpu_inst(struct cpu_rng_inst __percpu *pcri) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct cpu_rng_inst *cri = per_cpu_ptr(pcri, cpu); + + cri->page = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); + local_lock_init(&cri->lock); + } +} + static int __init crypto_rng_init(void) { - if (fips_enabled) - random_register_extrng(&crypto_devrandom_rng); + if (!fips_enabled) + return 0; + + /* + * Never fail to register the RNG override in FIPS mode because failure + * would result in the system quietly booting without the FIPS-mandated + * RNG installed. This would be catastrophic for FIPS compliance, hence + * the RNG override setup is *not* allowed to fail. + */ + alloc_pcpu_inst(&pcpu_default_rng); + alloc_pcpu_inst(&pcpu_reseed_rng); + random_register_extrng(&crypto_devrandom_rng); return 0; } From 109c727a4f1995a158abf53be81ee5c0273ef798 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 29 Jul 2025 12:12:38 -0400 Subject: [PATCH 15/34] configs: Ensure FIPS settings defined We want to hard set the x86_64 FIPS required configs rather than rely on default settings in the kernel, should these ever change without our knowing it would not be something we would have actively checked. The configs are a limited set of configs that is expanded out when building using `make olddefconfig` a common practice in kernel building. Note had to manually add the following since its normaly set by the RPM build process. CONFIG_CRYPTO_FIPS_NAME="Rocky Linux 9 Kernel Cryptographic API" Signed-off-by: Jonathan Maple --- configs/kernel-x86_64-debug-rhel.config | 11 +++++++++++ configs/kernel-x86_64-rhel.config | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/configs/kernel-x86_64-debug-rhel.config b/configs/kernel-x86_64-debug-rhel.config index 77b391b9d43f3..35f1ec83b4985 100644 --- a/configs/kernel-x86_64-debug-rhel.config +++ b/configs/kernel-x86_64-debug-rhel.config @@ -7263,3 +7263,14 @@ CONFIG_ZSWAP=y # CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD is not set CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set + +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +CONFIG_FIPS_SIGNATURE_SELFTEST=y +CONFIG_FIPS_SIGNATURE_SELFTEST_RSA=y +CONFIG_FIPS_SIGNATURE_SELFTEST_ECDSA=y +CONFIG_CRYPTO_DRBG=y +CONFIG_CRYPTO_FIPS=y +CONFIG_CRYPTO_FIPS_CUSTOM_VERSION=y +CONFIG_CRYPTO_FIPS_VERSION="rocky9.20250725" +CONFIG_CRYPTO_FIPS_NAME="Rocky Linux 9 Kernel Cryptographic API" diff --git a/configs/kernel-x86_64-rhel.config b/configs/kernel-x86_64-rhel.config index d15a210df8fbe..d5eeea148e5c0 100644 --- a/configs/kernel-x86_64-rhel.config +++ b/configs/kernel-x86_64-rhel.config @@ -7240,3 +7240,14 @@ CONFIG_ZSWAP=y # CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD is not set CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y # CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set + +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +CONFIG_FIPS_SIGNATURE_SELFTEST=y +CONFIG_FIPS_SIGNATURE_SELFTEST_RSA=y +CONFIG_FIPS_SIGNATURE_SELFTEST_ECDSA=y +CONFIG_CRYPTO_DRBG=y +CONFIG_CRYPTO_FIPS=y +CONFIG_CRYPTO_FIPS_CUSTOM_VERSION=y +CONFIG_CRYPTO_FIPS_VERSION="rocky9.20250725" +CONFIG_CRYPTO_FIPS_NAME="Rocky Linux 9 Kernel Cryptographic API" From 764e85738c7acd122fa36ee40293041ab1682069 Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Fri, 17 Oct 2025 12:52:16 -0400 Subject: [PATCH 16/34] github actions: Use 
reusable validate kernel commits workflow Simplifies the workflow to use the reusable workflow defined in main branch. This reduces duplication and makes the workflow easier to maintain across multiple branches. The workflow was renamed because it now includes validation over and above just checking for upstream fixes Signed-off-by: Jonathan Maple --- .github/workflows/validate-kernel-commits.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 .github/workflows/validate-kernel-commits.yml diff --git a/.github/workflows/validate-kernel-commits.yml b/.github/workflows/validate-kernel-commits.yml new file mode 100644 index 0000000000000..c74434336e251 --- /dev/null +++ b/.github/workflows/validate-kernel-commits.yml @@ -0,0 +1,10 @@ +name: Validate Kernel Commits + +on: + pull_request: + types: [opened, synchronize, reopened] + +jobs: + check: + uses: ctrliq/kernel-src-tree/.github/workflows/validate-kernel-commits.yml@main + secrets: inherit From d56b3948a23de271b455e484a7747f3ce64168f3 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Mon, 9 Jun 2025 15:49:43 -0400 Subject: [PATCH 17/34] tools: hv: Enable debug logs for hv_kvp_daemon jira LE-3207 feature tools_hv commit-author Shradha Gupta commit a9c0b33ef2306327dd2db02c6274107065ff9307 Allow the KVP daemon to log the KVP updates triggered in the VM with a new debug flag(-d). When the daemon is started with this flag, it logs updates and debug information in syslog with loglevel LOG_DEBUG. This information comes in handy for debugging issues where the key-value pairs for certain pools show mismatch/incorrect values. The distro-vendors can further consume these changes and modify the respective service files to redirect the logs to specific files as needed. Signed-off-by: Shradha Gupta Reviewed-by: Naman Jain Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/1744715978-8185-1-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1744715978-8185-1-git-send-email-shradhagupta@linux.microsoft.com> (cherry picked from commit a9c0b33ef2306327dd2db02c6274107065ff9307) Signed-off-by: Jonathan Maple --- tools/hv/hv_kvp_daemon.c | 64 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 5 deletions(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 1e6fd6ca513bd..0e0c997134ec6 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -77,6 +77,7 @@ enum { }; static int in_hand_shake; +static int debug; static char *os_name = ""; static char *os_major = ""; @@ -172,6 +173,20 @@ static void kvp_update_file(int pool) kvp_release_lock(pool); } +static void kvp_dump_initial_pools(int pool) +{ + int i; + + syslog(LOG_DEBUG, "===Start dumping the contents of pool %d ===\n", + pool); + + for (i = 0; i < kvp_file_info[pool].num_records; i++) + syslog(LOG_DEBUG, "pool: %d, %d/%d key=%s val=%s\n", + pool, i + 1, kvp_file_info[pool].num_records, + kvp_file_info[pool].records[i].key, + kvp_file_info[pool].records[i].value); +} + static void kvp_update_mem_state(int pool) { FILE *filep; @@ -259,6 +274,8 @@ static int kvp_file_init(void) return 1; kvp_file_info[i].num_records = 0; kvp_update_mem_state(i); + if (debug) + kvp_dump_initial_pools(i); } return 0; @@ -286,6 +303,9 @@ static int kvp_key_delete(int pool, const __u8 *key, int key_size) * Found a match; just move the remaining * entries up. 
*/ + if (debug) + syslog(LOG_DEBUG, "%s: deleting the KVP: pool=%d key=%s val=%s", + __func__, pool, record[i].key, record[i].value); if (i == (num_records - 1)) { kvp_file_info[pool].num_records--; kvp_update_file(pool); @@ -304,20 +324,36 @@ static int kvp_key_delete(int pool, const __u8 *key, int key_size) kvp_update_file(pool); return 0; } + + if (debug) + syslog(LOG_DEBUG, "%s: could not delete KVP: pool=%d key=%s. Record not found", + __func__, pool, key); + return 1; } static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, const __u8 *value, int value_size) { - int i; - int num_records; struct kvp_record *record; + int num_records; int num_blocks; + int i; + + if (debug) + syslog(LOG_DEBUG, "%s: got a KVP: pool=%d key=%s val=%s", + __func__, pool, key, value); if ((key_size > HV_KVP_EXCHANGE_MAX_KEY_SIZE) || - (value_size > HV_KVP_EXCHANGE_MAX_VALUE_SIZE)) + (value_size > HV_KVP_EXCHANGE_MAX_VALUE_SIZE)) { + syslog(LOG_ERR, "%s: Too long key or value: key=%s, val=%s", + __func__, key, value); + + if (debug) + syslog(LOG_DEBUG, "%s: Too long key or value: pool=%d, key=%s, val=%s", + __func__, pool, key, value); return 1; + } /* * First update the in-memory state. @@ -337,6 +373,9 @@ static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, */ memcpy(record[i].value, value, value_size); kvp_update_file(pool); + if (debug) + syslog(LOG_DEBUG, "%s: updated: pool=%d key=%s val=%s", + __func__, pool, key, value); return 0; } @@ -348,8 +387,10 @@ static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, record = realloc(record, sizeof(struct kvp_record) * ENTRIES_PER_BLOCK * (num_blocks + 1)); - if (record == NULL) + if (!record) { + syslog(LOG_ERR, "%s: Memory alloc failure", __func__); return 1; + } kvp_file_info[pool].num_blocks++; } @@ -357,6 +398,11 @@ static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, memcpy(record[i].key, key, key_size); kvp_file_info[pool].records = record; kvp_file_info[pool].num_records++; + + if (debug) + syslog(LOG_DEBUG, "%s: added: pool=%d key=%s val=%s", + __func__, pool, key, value); + kvp_update_file(pool); return 0; } @@ -1355,6 +1401,7 @@ void print_usage(char *argv[]) fprintf(stderr, "Usage: %s [options]\n" "Options are:\n" " -n, --no-daemon stay in foreground, don't daemonize\n" + " -d, --debug Enable debug logs(syslog debug by default)\n" " -h, --help print this help\n", argv[0]); } @@ -1376,10 +1423,11 @@ int main(int argc, char *argv[]) static struct option long_options[] = { {"help", no_argument, 0, 'h' }, {"no-daemon", no_argument, 0, 'n' }, + {"debug", no_argument, 0, 'd' }, {0, 0, 0, 0 } }; - while ((opt = getopt_long(argc, argv, "hn", long_options, + while ((opt = getopt_long(argc, argv, "hnd", long_options, &long_index)) != -1) { switch (opt) { case 'n': @@ -1388,6 +1436,9 @@ int main(int argc, char *argv[]) case 'h': print_usage(argv); exit(0); + case 'd': + debug = 1; + break; default: print_usage(argv); exit(EXIT_FAILURE); @@ -1410,6 +1461,9 @@ int main(int argc, char *argv[]) */ kvp_get_domain_name(full_domain_name, sizeof(full_domain_name)); + if (debug) + syslog(LOG_INFO, "Logging debug info in syslog(debug)"); + if (kvp_file_init()) { syslog(LOG_ERR, "Failed to initialize the pools"); exit(EXIT_FAILURE); From d045b343405f65d1bbee7c8751eb07ee03df3ab5 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Fri, 25 Jul 2025 14:32:37 -0700 Subject: [PATCH 18/34] crypto: rng - Only allow the DRBG to register as "stdrng" in FIPS mode In FIPS mode, the DRBG must take 
precedence over all stdrng algorithms. The only problem standing in the way of this is that a different stdrng algorithm could get registered and utilized before the DRBG is registered, and since crypto_alloc_rng() only allocates an stdrng algorithm when there's no existing allocation, this means that it's possible for the wrong stdrng algorithm to remain in use indefinitely. This issue is also often impossible to observe from userspace; an RNG other than the DRBG could be used somewhere in the kernel and userspace would be none the wiser. To ensure this can never happen, only allow stdrng instances from the DRBG to be registered when running in FIPS mode. This works since the previous commit forces the DRBG to be built into the kernel when CONFIG_CRYPTO_FIPS is enabled, so the DRBG's presence is guaranteed when fips_enabled is true. Signed-off-by: Sultan Alsawaf --- crypto/rng.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/crypto/rng.c b/crypto/rng.c index 5652a21cc7cd3..011a510492580 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -32,6 +32,7 @@ static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_default_rng_lock); struct crypto_rng *crypto_default_rng; EXPORT_SYMBOL_GPL(crypto_default_rng); static unsigned int crypto_default_rng_refcnt; +static bool drbg_registered __ro_after_init; /* * Per-CPU RNG instances are only used by crypto_devrandom_rng. The global RNG, @@ -215,6 +216,19 @@ int crypto_register_rng(struct rng_alg *alg) if (alg->seedsize > PAGE_SIZE / 8) return -EINVAL; + /* + * In FIPS mode, the DRBG must take precedence over all other "stdrng" + * algorithms. Therefore, forbid registration of a non-DRBG stdrng in + * FIPS mode. All of the DRBG's driver names are prefixed with "drbg_". + * This also stops new stdrng instances from getting registered after it + * is known that the DRBG is registered, so a new module can't come in + * and pretend to be the DRBG. And when CONFIG_CRYPTO_FIPS is enabled, + * the DRBG is built into the kernel directly; it can't be a module. + */ + if (fips_enabled && !strcmp(base->cra_name, "stdrng") && + (drbg_registered || strncmp(base->cra_driver_name, "drbg_", 5))) + return -EINVAL; + base->cra_type = &crypto_rng_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_RNG; @@ -239,6 +253,18 @@ int crypto_register_rngs(struct rng_alg *algs, int count) goto err; } + /* + * Track when the DRBG is registered in FIPS mode. The DRBG calls + * crypto_register_rngs() to register its stdrng instances, and since + * crypto_register_rng() only allows stdrng instances from the DRBG in + * FIPS mode, a successful stdrng registration means it was the DRBG. + * Just check the first alg in the array to see if it's called "stdrng", + * since all of the DRBG's algorithms are named "stdrng". Once + * drbg_registered is set to true, this if-statement is always false. + */ + if (fips_enabled && !strcmp(algs->base.cra_name, "stdrng")) + drbg_registered = true; + return 0; err: From 450fdc536e36b31fcefb8dfffc1f194900cdd8f6 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 15 Dec 2025 12:30:15 +0000 Subject: [PATCH 19/34] PCI/MSI: Export pci_msix_prepare_desc() for dynamic MSI-X allocations jira LE-4466 commit-author Shradha Gupta commit 5da8a8b8090b5f79a816ba016af3a70a9d7287bf For supporting dynamic MSI-X vector allocation by PCI controllers, enabling the flag MSI_FLAG_PCI_MSIX_ALLOC_DYN is not enough, msix_prepare_msi_desc() to prepare the MSI descriptor is also needed. 
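For context, a controller driver that consumes this export is expected to wire the helper into its MSI domain ops roughly as follows (hypothetical sketch; the my_* names are placeholders, only the MSI_FLAG_* flags and pci_msix_prepare_desc() itself are real, and the actual pci-hyperv wiring appears in the next patch):

static struct msi_domain_ops my_msi_ops = {
        .msi_prepare  = my_msi_prepare,         /* placeholder callback */
        .prepare_desc = pci_msix_prepare_desc,  /* the helper exported here */
};

static struct msi_domain_info my_msi_info = {
        .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
                 MSI_FLAG_PCI_MSIX | MSI_FLAG_PCI_MSIX_ALLOC_DYN,
        .ops   = &my_msi_ops,
        .chip  = &my_msi_irq_chip,              /* placeholder irq_chip */
};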
Export pci_msix_prepare_desc() to allow PCI controllers to support dynamic MSI-X vector allocation. Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Reviewed-by: Thomas Gleixner Reviewed-by: Saurabh Sengar Acked-by: Bjorn Helgaas (cherry picked from commit 5da8a8b8090b5f79a816ba016af3a70a9d7287bf) Signed-off-by: Shreeya Patel --- drivers/pci/msi/irqdomain.c | 5 +++-- include/linux/msi.h | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 78990f84ac516..38db89be75b9f 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -202,13 +202,14 @@ static void pci_irq_unmask_msix(struct irq_data *data) pci_msix_unmask(irq_data_get_msi_desc(data)); } -static void pci_msix_prepare_desc(struct irq_domain *domain, msi_alloc_info_t *arg, - struct msi_desc *desc) +void pci_msix_prepare_desc(struct irq_domain *domain, msi_alloc_info_t *arg, + struct msi_desc *desc) { /* Don't fiddle with preallocated MSI descriptors */ if (!desc->pci.mask_base) msix_prepare_msi_desc(to_pci_dev(desc->dev), desc); } +EXPORT_SYMBOL_GPL(pci_msix_prepare_desc); static const struct msi_domain_template pci_msix_template = { .chip = { diff --git a/include/linux/msi.h b/include/linux/msi.h index 48dd4e533b1fd..785142674f2aa 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -674,6 +674,8 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, struct irq_domain *parent); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev); +void pci_msix_prepare_desc(struct irq_domain *domain, msi_alloc_info_t *arg, + struct msi_desc *desc); #else /* CONFIG_PCI_MSI */ static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) { From 36df1eb6a543da6a10f57781689950c37d88bf99 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 15 Dec 2025 12:31:16 +0000 Subject: [PATCH 20/34] PCI: hv: Allow dynamic MSI-X vector allocation jira LE-4466 commit-author Shradha Gupta commit ad518f2557b971976fc9d99a6a8cd2b453742bf9 Allow dynamic MSI-X vector allocation for pci_hyperv PCI controller by adding support for the flag MSI_FLAG_PCI_MSIX_ALLOC_DYN and using pci_msix_prepare_desc() to prepare the MSI-X descriptors. 
Feature support added for both x86 and ARM64 Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Reviewed-by: Saurabh Sengar Acked-by: Bjorn Helgaas (cherry picked from commit ad518f2557b971976fc9d99a6a8cd2b453742bf9) Signed-off-by: Shreeya Patel --- drivers/pci/controller/pci-hyperv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 4bee067699aca..5c5e35da5f87f 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2063,6 +2063,7 @@ static struct irq_chip hv_msi_irq_chip = { static struct msi_domain_ops hv_msi_ops = { .msi_prepare = hv_msi_prepare, .msi_free = hv_msi_free, + .prepare_desc = pci_msix_prepare_desc, }; /** @@ -2084,7 +2085,7 @@ static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus) hbus->msi_info.ops = &hv_msi_ops; hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI | - MSI_FLAG_PCI_MSIX); + MSI_FLAG_PCI_MSIX | MSI_FLAG_PCI_MSIX_ALLOC_DYN); hbus->msi_info.handler = FLOW_HANDLER; hbus->msi_info.handler_name = FLOW_NAME; hbus->msi_info.data = hbus; From 79f9252b550cbbd22c0a9e6592675d4016c01b23 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 15 Dec 2025 12:31:44 +0000 Subject: [PATCH 21/34] net: mana: explain irq_setup() algorithm jira LE-4466 commit-author Yury Norov commit 4607617af1b4747df0284ea8c1ddcecb21cae528 Commit 91bfe210e196 ("net: mana: add a function to spread IRQs per CPUs") added the irq_setup() function that distributes IRQs on CPUs according to a tricky heuristic. The corresponding commit message explains the heuristic. Duplicate it in the source code to make available for readers without digging git in history. Also, add more detailed explanation about how the heuristics is implemented. Signed-off-by: Yury Norov Signed-off-by: Shradha Gupta (cherry picked from commit 4607617af1b4747df0284ea8c1ddcecb21cae528) Signed-off-by: Shreeya Patel --- .../net/ethernet/microsoft/mana/gdma_main.c | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index a20f5eef0324e..9212d43c1026d 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -1447,6 +1447,47 @@ void mana_gd_free_res_map(struct gdma_resource *r) r->size = 0; } +/* + * Spread on CPUs with the following heuristics: + * + * 1. No more than one IRQ per CPU, if possible; + * 2. NUMA locality is the second priority; + * 3. Sibling dislocality is the last priority. + * + * Let's consider this topology: + * + * Node 0 1 + * Core 0 1 2 3 + * CPU 0 1 2 3 4 5 6 7 + * + * The most performant IRQ distribution based on the above topology + * and heuristics may look like this: + * + * IRQ Nodes Cores CPUs + * 0 1 0 0-1 + * 1 1 1 2-3 + * 2 1 0 0-1 + * 3 1 1 2-3 + * 4 2 2 4-5 + * 5 2 3 6-7 + * 6 2 2 4-5 + * 7 2 3 6-7 + * + * The heuristics is implemented as follows. + * + * The outer for_each() loop resets the 'weight' to the actual number + * of CPUs in the hop. Then inner for_each() loop decrements it by the + * number of sibling groups (cores) while assigning first set of IRQs + * to each group. IRQs 0 and 1 above are distributed this way. + * + * Now, because NUMA locality is more important, we should walk the + * same set of siblings and assign 2nd set of IRQs (2 and 3), and it's + * implemented by the medium while() loop. 
We do like this unless the + * number of IRQs assigned on this hop will not become equal to number + * of CPUs in the hop (weight == 0). Then we switch to the next hop and + * do the same thing. + */ + static int irq_setup(unsigned int *irqs, unsigned int len, int node) { const struct cpumask *next, *prev = cpu_none_mask; From 0bb789285fd8d1e0044b31433f0d9604fce10d84 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 15 Dec 2025 12:32:06 +0000 Subject: [PATCH 22/34] net: mana: Allow irq_setup() to skip cpus for affinity jira LE-4466 commit-author Shradha Gupta commit 845c62c543d6bd5d8b80f53835997789e4bb8e29 In order to prepare the MANA driver to allocate the MSI-X IRQs dynamically, we need to enhance irq_setup() to allow skipping affinitizing IRQs to the first CPU sibling group. This would be for cases when the number of IRQs is less than or equal to the number of online CPUs. In such cases for dynamically added IRQs the first CPU sibling group would already be affinitized with HWC IRQ. Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Reviewed-by: Yury Norov [NVIDIA] (cherry picked from commit 845c62c543d6bd5d8b80f53835997789e4bb8e29) Signed-off-by: Shreeya Patel --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 9212d43c1026d..fde05799fe610 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -1488,7 +1488,8 @@ void mana_gd_free_res_map(struct gdma_resource *r) * do the same thing. */ -static int irq_setup(unsigned int *irqs, unsigned int len, int node) +static int irq_setup(unsigned int *irqs, unsigned int len, int node, + bool skip_first_cpu) { const struct cpumask *next, *prev = cpu_none_mask; cpumask_var_t cpus __free(free_cpumask_var); @@ -1503,11 +1504,18 @@ static int irq_setup(unsigned int *irqs, unsigned int len, int node) while (weight > 0) { cpumask_andnot(cpus, next, prev); for_each_cpu(cpu, cpus) { + cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); + --weight; + + if (unlikely(skip_first_cpu)) { + skip_first_cpu = false; + continue; + } + if (len-- == 0) goto done; + irq_set_affinity_and_hint(*irqs++, topology_sibling_cpumask(cpu)); - cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); - --weight; } } prev = next; @@ -1603,7 +1611,7 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev) } } - err = irq_setup(irqs, (nvec - start_irq_index), gc->numa_node); + err = irq_setup(irqs, nvec - start_irq_index, gc->numa_node, false); if (err) goto free_irq; From 25025a926865bf719df2c23fd673e98d58335d60 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 15 Dec 2025 12:50:31 +0000 Subject: [PATCH 23/34] net: mana: Allocate MSI-X vectors dynamically jira LE-4466 commit-author Shradha Gupta commit 755391121038c06cb653241aa94dcabd87179f62 upstream-diff There were conflicts seen when applying this patch due to following commits present in our tree before this patch. 590bcf15ae4a ("net: mana: Add handler for hardware servicing events") 00c2b0fef4b5 ("net: mana: Fix warnings for missing export.h header inclusion") Currently, the MANA driver allocates MSI-X vectors statically based on MANA_MAX_NUM_QUEUES and num_online_cpus() values and in some cases ends up allocating more vectors than it needs. This is because, by this time we do not have a HW channel and do not know how many IRQs should be allocated. 
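The change below therefore moves to the dynamic MSI-X API; a condensed, illustrative sketch of that flow (only the pci_* calls are real kernel APIs, example_grow_msix() and its error handling are simplified placeholders):

#include <linux/pci.h>

/* Bring up with a single vector for the HWC, then grow to 'want'
 * vectors once the hardware has reported how many queues it supports. */
static int example_grow_msix(struct pci_dev *pdev, int want)
{
        struct msi_map map;
        int i, nvec;

        nvec = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX);
        if (nvec < 0)
                return nvec;

        if (!pci_msix_can_alloc_dyn(pdev))
                return -EOPNOTSUPP;     /* caller would fall back to static allocation */

        /* Index 0 already carries the HWC interrupt; add the rest dynamically. */
        for (i = 1; i < want; i++) {
                map = pci_msix_alloc_irq_at(pdev, i, NULL);
                if (!map.virq)
                        return map.index;       /* negative errno on failure */
        }
        return 0;
}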
To avoid this, we allocate 1 MSI-X vector during the creation of HWC and after getting the value supported by hardware, dynamically add the remaining MSI-X vectors. Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang (cherry picked from commit 755391121038c06cb653241aa94dcabd87179f62) Signed-off-by: Shreeya Patel Signed-off-by: Shreeya Patel --- .../net/ethernet/microsoft/mana/gdma_main.c | 311 +++++++++++++----- include/net/mana/gdma.h | 6 +- 2 files changed, 234 insertions(+), 83 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index fde05799fe610..3610c9ccd0172 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include @@ -104,8 +106,15 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev) return err ? err : -EPROTO; } - if (gc->num_msix_usable > resp.max_msix) - gc->num_msix_usable = resp.max_msix; + if (!pci_msix_can_alloc_dyn(pdev)) { + if (gc->num_msix_usable > resp.max_msix) + gc->num_msix_usable = resp.max_msix; + } else { + /* If dynamic allocation is enabled we have already allocated + * hwc msi + */ + gc->num_msix_usable = min(resp.max_msix, num_online_cpus() + 1); + } if (gc->num_msix_usable <= 1) return -ENOSPC; @@ -639,7 +648,9 @@ static int mana_gd_register_irq(struct gdma_queue *queue, } queue->eq.msix_index = msi_index; - gic = &gc->irq_contexts[msi_index]; + gic = xa_load(&gc->irq_contexts, msi_index); + if (WARN_ON(!gic)) + return -EINVAL; spin_lock_irqsave(&gic->lock, flags); list_add_rcu(&queue->entry, &gic->eq_list); @@ -664,7 +675,10 @@ static void mana_gd_deregiser_irq(struct gdma_queue *queue) if (WARN_ON(msix_index >= gc->num_msix_usable)) return; - gic = &gc->irq_contexts[msix_index]; + gic = xa_load(&gc->irq_contexts, msix_index); + if (WARN_ON(!gic)) + return; + spin_lock_irqsave(&gic->lock, flags); list_for_each_entry_rcu(eq, &gic->eq_list, entry) { if (queue == eq) { @@ -1525,47 +1539,108 @@ static int irq_setup(unsigned int *irqs, unsigned int len, int node, return 0; } -static int mana_gd_setup_irqs(struct pci_dev *pdev) +static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec) { struct gdma_context *gc = pci_get_drvdata(pdev); - unsigned int max_queues_per_port; struct gdma_irq_context *gic; - unsigned int max_irqs, cpu; - int start_irq_index = 1; - int nvec, *irqs, irq; - int err, i = 0, j; + bool skip_first_cpu = false; + int *irqs, irq, err, i; - cpus_read_lock(); - max_queues_per_port = num_online_cpus(); - if (max_queues_per_port > MANA_MAX_NUM_QUEUES) - max_queues_per_port = MANA_MAX_NUM_QUEUES; + irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL); + if (!irqs) + return -ENOMEM; + + /* + * While processing the next pci irq vector, we start with index 1, + * as IRQ vector at index 0 is already processed for HWC. 
+ * However, the population of irqs array starts with index 0, to be + * further used in irq_setup() + */ + for (i = 1; i <= nvec; i++) { + gic = kzalloc(sizeof(*gic), GFP_KERNEL); + if (!gic) { + err = -ENOMEM; + goto free_irq; + } + gic->handler = mana_gd_process_eq_events; + INIT_LIST_HEAD(&gic->eq_list); + spin_lock_init(&gic->lock); - /* Need 1 interrupt for the Hardware communication Channel (HWC) */ - max_irqs = max_queues_per_port + 1; + snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", + i - 1, pci_name(pdev)); - nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX); - if (nvec < 0) { - cpus_read_unlock(); - return nvec; + /* one pci vector is already allocated for HWC */ + irqs[i - 1] = pci_irq_vector(pdev, i); + if (irqs[i - 1] < 0) { + err = irqs[i - 1]; + goto free_current_gic; + } + + err = request_irq(irqs[i - 1], mana_gd_intr, 0, gic->name, gic); + if (err) + goto free_current_gic; + + xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL); } - if (nvec <= num_online_cpus()) - start_irq_index = 0; - irqs = kmalloc_array((nvec - start_irq_index), sizeof(int), GFP_KERNEL); - if (!irqs) { - err = -ENOMEM; - goto free_irq_vector; + /* + * When calling irq_setup() for dynamically added IRQs, if number of + * CPUs is more than or equal to allocated MSI-X, we need to skip the + * first CPU sibling group since they are already affinitized to HWC IRQ + */ + cpus_read_lock(); + if (gc->num_msix_usable <= num_online_cpus()) + skip_first_cpu = true; + + err = irq_setup(irqs, nvec, gc->numa_node, skip_first_cpu); + if (err) { + cpus_read_unlock(); + goto free_irq; } - gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context), - GFP_KERNEL); - if (!gc->irq_contexts) { - err = -ENOMEM; - goto free_irq_array; + cpus_read_unlock(); + kfree(irqs); + return 0; + +free_current_gic: + kfree(gic); +free_irq: + for (i -= 1; i > 0; i--) { + irq = pci_irq_vector(pdev, i); + gic = xa_load(&gc->irq_contexts, i); + if (WARN_ON(!gic)) + continue; + + irq_update_affinity_hint(irq, NULL); + free_irq(irq, gic); + xa_erase(&gc->irq_contexts, i); + kfree(gic); } + kfree(irqs); + return err; +} + +static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + struct gdma_irq_context *gic; + int *irqs, *start_irqs, irq; + unsigned int cpu; + int err, i; + + irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL); + if (!irqs) + return -ENOMEM; + + start_irqs = irqs; for (i = 0; i < nvec; i++) { - gic = &gc->irq_contexts[i]; + gic = kzalloc(sizeof(*gic), GFP_KERNEL); + if (!gic) { + err = -ENOMEM; + goto free_irq; + } + gic->handler = mana_gd_process_eq_events; INIT_LIST_HEAD(&gic->eq_list); spin_lock_init(&gic->lock); @@ -1577,69 +1652,128 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev) snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", i - 1, pci_name(pdev)); - irq = pci_irq_vector(pdev, i); - if (irq < 0) { - err = irq; - goto free_irq; + irqs[i] = pci_irq_vector(pdev, i); + if (irqs[i] < 0) { + err = irqs[i]; + goto free_current_gic; } - if (!i) { - err = request_irq(irq, mana_gd_intr, 0, gic->name, gic); - if (err) - goto free_irq; - - /* If number of IRQ is one extra than number of online CPUs, - * then we need to assign IRQ0 (hwc irq) and IRQ1 to - * same CPU. - * Else we will use different CPUs for IRQ0 and IRQ1. - * Also we are using cpumask_local_spread instead of - * cpumask_first for the node, because the node can be - * mem only. 
- */ - if (start_irq_index) { - cpu = cpumask_local_spread(i, gc->numa_node); - irq_set_affinity_and_hint(irq, cpumask_of(cpu)); - } else { - irqs[start_irq_index] = irq; - } - } else { - irqs[i - start_irq_index] = irq; - err = request_irq(irqs[i - start_irq_index], mana_gd_intr, 0, - gic->name, gic); - if (err) - goto free_irq; - } + err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic); + if (err) + goto free_current_gic; + + xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL); } - err = irq_setup(irqs, nvec - start_irq_index, gc->numa_node, false); - if (err) + /* If number of IRQ is one extra than number of online CPUs, + * then we need to assign IRQ0 (hwc irq) and IRQ1 to + * same CPU. + * Else we will use different CPUs for IRQ0 and IRQ1. + * Also we are using cpumask_local_spread instead of + * cpumask_first for the node, because the node can be + * mem only. + */ + cpus_read_lock(); + if (nvec > num_online_cpus()) { + cpu = cpumask_local_spread(0, gc->numa_node); + irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu)); + irqs++; + nvec -= 1; + } + + err = irq_setup(irqs, nvec, gc->numa_node, false); + if (err) { + cpus_read_unlock(); goto free_irq; + } - gc->max_num_msix = nvec; - gc->num_msix_usable = nvec; cpus_read_unlock(); - kfree(irqs); + kfree(start_irqs); return 0; +free_current_gic: + kfree(gic); free_irq: - for (j = i - 1; j >= 0; j--) { - irq = pci_irq_vector(pdev, j); - gic = &gc->irq_contexts[j]; + for (i -= 1; i >= 0; i--) { + irq = pci_irq_vector(pdev, i); + gic = xa_load(&gc->irq_contexts, i); + if (WARN_ON(!gic)) + continue; irq_update_affinity_hint(irq, NULL); free_irq(irq, gic); + xa_erase(&gc->irq_contexts, i); + kfree(gic); } - kfree(gc->irq_contexts); - gc->irq_contexts = NULL; -free_irq_array: - kfree(irqs); -free_irq_vector: - cpus_read_unlock(); - pci_free_irq_vectors(pdev); + kfree(start_irqs); return err; } +static int mana_gd_setup_hwc_irqs(struct pci_dev *pdev) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + unsigned int max_irqs, min_irqs; + int nvec, err; + + if (pci_msix_can_alloc_dyn(pdev)) { + max_irqs = 1; + min_irqs = 1; + } else { + /* Need 1 interrupt for HWC */ + max_irqs = min(num_online_cpus(), MANA_MAX_NUM_QUEUES) + 1; + min_irqs = 2; + } + + nvec = pci_alloc_irq_vectors(pdev, min_irqs, max_irqs, PCI_IRQ_MSIX); + if (nvec < 0) + return nvec; + + err = mana_gd_setup_irqs(pdev, nvec); + if (err) { + pci_free_irq_vectors(pdev); + return err; + } + + gc->num_msix_usable = nvec; + gc->max_num_msix = nvec; + + return 0; +} + +static int mana_gd_setup_remaining_irqs(struct pci_dev *pdev) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + struct msi_map irq_map; + int max_irqs, i, err; + + if (!pci_msix_can_alloc_dyn(pdev)) + /* remain irqs are already allocated with HWC IRQ */ + return 0; + + /* allocate only remaining IRQs*/ + max_irqs = gc->num_msix_usable - 1; + + for (i = 1; i <= max_irqs; i++) { + irq_map = pci_msix_alloc_irq_at(pdev, i, NULL); + if (!irq_map.virq) { + err = irq_map.index; + /* caller will handle cleaning up all allocated + * irqs, after HWC is destroyed + */ + return err; + } + } + + err = mana_gd_setup_dyn_irqs(pdev, max_irqs); + if (err) + return err; + + gc->max_num_msix = gc->max_num_msix + max_irqs; + + return 0; +} + static void mana_gd_remove_irqs(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); @@ -1654,19 +1788,21 @@ static void mana_gd_remove_irqs(struct pci_dev *pdev) if (irq < 0) continue; - gic = &gc->irq_contexts[i]; + gic = xa_load(&gc->irq_contexts, i); + if 
(WARN_ON(!gic)) + continue; /* Need to clear the hint before free_irq */ irq_update_affinity_hint(irq, NULL); free_irq(irq, gic); + xa_erase(&gc->irq_contexts, i); + kfree(gic); } pci_free_irq_vectors(pdev); gc->max_num_msix = 0; gc->num_msix_usable = 0; - kfree(gc->irq_contexts); - gc->irq_contexts = NULL; } static int mana_gd_setup(struct pci_dev *pdev) @@ -1681,9 +1817,10 @@ static int mana_gd_setup(struct pci_dev *pdev) if (!gc->service_wq) return -ENOMEM; - err = mana_gd_setup_irqs(pdev); + err = mana_gd_setup_hwc_irqs(pdev); if (err) { - dev_err(gc->dev, "Failed to setup IRQs: %d\n", err); + dev_err(gc->dev, "Failed to setup IRQs for HWC creation: %d\n", + err); goto free_workqueue; } @@ -1699,6 +1836,12 @@ static int mana_gd_setup(struct pci_dev *pdev) if (err) goto destroy_hwc; + err = mana_gd_setup_remaining_irqs(pdev); + if (err) { + dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err); + goto destroy_hwc; + } + err = mana_gd_detect_devices(pdev); if (err) goto destroy_hwc; @@ -1779,6 +1922,7 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) gc->is_pf = mana_is_pf(pdev->device); gc->bar0_va = bar0_va; gc->dev = &pdev->dev; + xa_init(&gc->irq_contexts); if (gc->is_pf) gc->mana_pci_debugfs = debugfs_create_dir("0", mana_debugfs_root); @@ -1813,6 +1957,7 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) */ debugfs_remove_recursive(gc->mana_pci_debugfs); gc->mana_pci_debugfs = NULL; + xa_destroy(&gc->irq_contexts); pci_iounmap(pdev, bar0_va); free_gc: pci_set_drvdata(pdev, NULL); @@ -1838,6 +1983,8 @@ static void mana_gd_remove(struct pci_dev *pdev) gc->mana_pci_debugfs = NULL; + xa_destroy(&gc->irq_contexts); + pci_iounmap(pdev, gc->bar0_va); vfree(gc); diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 79516db61bcae..57df78cfbf82c 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -390,7 +390,7 @@ struct gdma_context { unsigned int max_num_queues; unsigned int max_num_msix; unsigned int num_msix_usable; - struct gdma_irq_context *irq_contexts; + struct xarray irq_contexts; /* L2 MTU */ u16 adapter_mtu; @@ -582,6 +582,9 @@ enum { /* Driver can handle holes (zeros) in the device list */ #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11) +/* Driver supports dynamic MSI-X vector allocation */ +#define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13) + /* Driver can self reset on EQE notification */ #define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14) @@ -594,6 +597,7 @@ enum { GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \ GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \ + GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \ GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \ GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE) From 8e19d0533a0b99a59453c62b450a4eb699d53a55 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Tue, 16 Dec 2025 15:09:16 +0000 Subject: [PATCH 24/34] net: mana: Add support for net_shaper_ops jira LE-4472 commit-author Erni Sri Satya Vennela commit 75cabb46935b6de8e2bdfde563e460ac41cfff12 upstream-diff There was a conflict seen when applying this patch due to the following commit not present in our tree. 92272ec4107e ("eth: add missing xdp.h includes in drivers") Introduce support for net_shaper_ops in the MANA driver, enabling configuration of rate limiting on the MANA NIC. 
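The shaper takes bw-max in bps while the device is clamped in whole Mbps; a condensed sketch of the validation and unit conversion described below (illustrative helper, not the literal driver code):

/* bw_max is in bps; only non-zero exact multiples of 100 Mbps are allowed. */
static bool bw_max_acceptable(u64 bw_max)
{
        return bw_max && !(bw_max % 100000000ULL);
}

/* Example: 200000000 bps -> 200 Mbps, accepted;
 * 250000000 bps -> 250 Mbps, rejected (not a multiple of 100 Mbps). */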
To apply rate limiting, the driver issues a HWC command via mana_set_bw_clamp() and updates the corresponding shaper object in the net_shaper cache. If an error occurs during this process, the driver restores the previous speed by querying the current link configuration using mana_query_link_cfg(). The minimum supported bandwidth is 100 Mbps, and only values that are exact multiples of 100 Mbps are allowed. Any other values are rejected. To remove a shaper, the driver resets the bandwidth to the maximum supported by the SKU using mana_set_bw_clamp() and clears the associated cache entry. If an error occurs during this process, the shaper details are retained. On the hardware that does not support these APIs, the net-shaper calls to set speed would fail. Set the speed: ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/net_shaper.yaml \ --do set --json '{"ifindex":'$IFINDEX', "handle":{"scope": "netdev", "id":'$ID' }, "bw-max": 200000000 }' Get the shaper details: ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/net_shaper.yaml \ --do get --json '{"ifindex":'$IFINDEX', "handle":{"scope": "netdev", "id":'$ID' }}' > {'bw-max': 200000000, > 'handle': {'scope': 'netdev'}, > 'ifindex': $IFINDEX, > 'metric': 'bps'} Delete the shaper object: ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/net_shaper.yaml \ --do delete --json '{"ifindex":'$IFINDEX', "handle":{"scope": "netdev","id":'$ID' }}' Signed-off-by: Erni Sri Satya Vennela Reviewed-by: Haiyang Zhang Reviewed-by: Shradha Gupta Reviewed-by: Saurabh Singh Sengar Reviewed-by: Long Li Link: https://patch.msgid.link/1750144656-2021-3-git-send-email-ernis@linux.microsoft.com Signed-off-by: Paolo Abeni (cherry picked from commit 75cabb46935b6de8e2bdfde563e460ac41cfff12) Signed-off-by: Shreeya Patel --- drivers/net/ethernet/microsoft/mana/mana_en.c | 155 ++++++++++++++++++ include/net/mana/mana.h | 41 +++++ 2 files changed, 196 insertions(+) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 35cbb3a15f6d4..d897b49ba0665 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -731,6 +731,78 @@ static int mana_change_mtu(struct net_device *ndev, int new_mtu) return err; } +static int mana_shaper_set(struct net_shaper_binding *binding, + const struct net_shaper *shaper, + struct netlink_ext_ack *extack) +{ + struct mana_port_context *apc = netdev_priv(binding->netdev); + u32 old_speed, rate; + int err; + + if (shaper->handle.scope != NET_SHAPER_SCOPE_NETDEV) { + NL_SET_ERR_MSG_MOD(extack, "net shaper scope should be netdev"); + return -EINVAL; + } + + if (apc->handle.id && shaper->handle.id != apc->handle.id) { + NL_SET_ERR_MSG_MOD(extack, "Cannot create multiple shapers"); + return -EOPNOTSUPP; + } + + if (!shaper->bw_max || (shaper->bw_max % 100000000)) { + NL_SET_ERR_MSG_MOD(extack, "Please use multiples of 100Mbps for bandwidth"); + return -EINVAL; + } + + rate = div_u64(shaper->bw_max, 1000); /* Convert bps to Kbps */ + rate = div_u64(rate, 1000); /* Convert Kbps to Mbps */ + + /* Get current speed */ + err = mana_query_link_cfg(apc); + old_speed = (err) ? SPEED_UNKNOWN : apc->speed; + + if (!err) { + err = mana_set_bw_clamp(apc, rate, TRI_STATE_TRUE); + apc->speed = (err) ? old_speed : rate; + apc->handle = (err) ? 
apc->handle : shaper->handle; + } + + return err; +} + +static int mana_shaper_del(struct net_shaper_binding *binding, + const struct net_shaper_handle *handle, + struct netlink_ext_ack *extack) +{ + struct mana_port_context *apc = netdev_priv(binding->netdev); + int err; + + err = mana_set_bw_clamp(apc, 0, TRI_STATE_FALSE); + + if (!err) { + /* Reset mana port context parameters */ + apc->handle.id = 0; + apc->handle.scope = NET_SHAPER_SCOPE_UNSPEC; + apc->speed = 0; + } + + return err; +} + +static void mana_shaper_cap(struct net_shaper_binding *binding, + enum net_shaper_scope scope, + unsigned long *flags) +{ + *flags = BIT(NET_SHAPER_A_CAPS_SUPPORT_BW_MAX) | + BIT(NET_SHAPER_A_CAPS_SUPPORT_METRIC_BPS); +} + +static const struct net_shaper_ops mana_shaper_ops = { + .set = mana_shaper_set, + .delete = mana_shaper_del, + .capabilities = mana_shaper_cap, +}; + static const struct net_device_ops mana_devops = { .ndo_open = mana_open, .ndo_stop = mana_close, @@ -741,6 +813,7 @@ static const struct net_device_ops mana_devops = { .ndo_bpf = mana_bpf, .ndo_xdp_xmit = mana_xdp_xmit, .ndo_change_mtu = mana_change_mtu, + .net_shaper_ops = &mana_shaper_ops, }; static void mana_cleanup_port_context(struct mana_port_context *apc) @@ -1184,6 +1257,86 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc, return err; } +int mana_query_link_cfg(struct mana_port_context *apc) +{ + struct net_device *ndev = apc->ndev; + struct mana_query_link_config_resp resp = {}; + struct mana_query_link_config_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_LINK_CONFIG, + sizeof(req), sizeof(resp)); + + req.vport = apc->port_handle; + req.hdr.resp.msg_version = GDMA_MESSAGE_V2; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + + if (err) { + netdev_err(ndev, "Failed to query link config: %d\n", err); + return err; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_LINK_CONFIG, + sizeof(resp)); + + if (err || resp.hdr.status) { + netdev_err(ndev, "Failed to query link config: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = -EOPNOTSUPP; + return err; + } + + if (resp.qos_unconfigured) { + err = -EINVAL; + return err; + } + apc->speed = resp.link_speed_mbps; + return 0; +} + +int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, + int enable_clamping) +{ + struct mana_set_bw_clamp_resp resp = {}; + struct mana_set_bw_clamp_req req = {}; + struct net_device *ndev = apc->ndev; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_SET_BW_CLAMP, + sizeof(req), sizeof(resp)); + req.vport = apc->port_handle; + req.link_speed_mbps = speed; + req.enable_clamping = enable_clamping; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + + if (err) { + netdev_err(ndev, "Failed to set bandwidth clamp for speed %u, err = %d", + speed, err); + return err; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_SET_BW_CLAMP, + sizeof(resp)); + + if (err || resp.hdr.status) { + netdev_err(ndev, "Failed to set bandwidth clamp: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = -EOPNOTSUPP; + return err; + } + + if (resp.qos_unconfigured) + netdev_info(ndev, "QoS is unconfigured\n"); + + return 0; +} + int mana_create_wq_obj(struct mana_port_context *apc, mana_handle_t vport, u32 wq_type, struct mana_obj_spec *wq_spec, @@ -3024,6 +3177,8 @@ static int mana_probe_port(struct mana_context *ac, int port_idx, goto free_indir; } + debugfs_create_u32("current_speed", 0400, apc->mana_port_debugfs, &apc->speed); + return 0; 
free_indir: diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 3ce29a6c1aaf5..00926585a8fe1 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -4,6 +4,8 @@ #ifndef _MANA_H #define _MANA_H +#include + #include "gdma.h" #include "hw_channel.h" @@ -524,7 +526,12 @@ struct mana_port_context { struct mutex vport_mutex; int vport_use_count; + /* Net shaper handle*/ + struct net_shaper_handle handle; + u16 port_idx; + /* Currently configured speed (mbps) */ + u32 speed; bool port_is_up; bool port_st_save; /* Saved port state */ @@ -560,6 +567,9 @@ struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); void mana_query_gf_stats(struct mana_port_context *apc); +int mana_query_link_cfg(struct mana_port_context *apc); +int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, + int enable_clamping); void mana_query_phy_stats(struct mana_port_context *apc); int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues); void mana_pre_dealloc_rxbufs(struct mana_port_context *apc); @@ -587,6 +597,8 @@ enum mana_command_code { MANA_FENCE_RQ = 0x20006, MANA_CONFIG_VPORT_RX = 0x20007, MANA_QUERY_VPORT_CONFIG = 0x20008, + MANA_QUERY_LINK_CONFIG = 0x2000A, + MANA_SET_BW_CLAMP = 0x2000B, MANA_QUERY_PHY_STAT = 0x2000c, /* Privileged commands for the PF mode */ @@ -596,6 +608,35 @@ enum mana_command_code { MANA_DEREGISTER_HW_PORT = 0x28004, }; +/* Query Link Configuration*/ +struct mana_query_link_config_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; +}; /* HW DATA */ + +struct mana_query_link_config_resp { + struct gdma_resp_hdr hdr; + u32 qos_speed_mbps; + u8 qos_unconfigured; + u8 reserved1[3]; + u32 link_speed_mbps; + u8 reserved2[4]; +}; /* HW DATA */ + +/* Set Bandwidth Clamp*/ +struct mana_set_bw_clamp_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + enum TRI_STATE enable_clamping; + u32 link_speed_mbps; +}; /* HW DATA */ + +struct mana_set_bw_clamp_resp { + struct gdma_resp_hdr hdr; + u8 qos_unconfigured; + u8 reserved[7]; +}; /* HW DATA */ + /* Query Device Configuration */ struct mana_query_device_cfg_req { struct gdma_req_hdr hdr; From 07148a4a5e7b27df5d311cf1b1779a0d4b59e0db Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Tue, 16 Dec 2025 15:12:16 +0000 Subject: [PATCH 25/34] net: mana: Add speed support in mana_get_link_ksettings jira LE-4472 commit-author Erni Sri Satya Vennela commit a6d5edf11e0cf5a4650f1d353d20ec29de093813 Allow mana ethtool get_link_ksettings operation to report the maximum speed supported by the SKU in mbps. The driver retrieves this information by issuing a HWC command to the hardware via mana_query_link_cfg(), which retrieves the SKU's maximum supported speed. These APIs when invoked on hardware that are older/do not support these APIs, the speed would be reported as UNKNOWN. Before: $ethtool enP30832s1 > Settings for enP30832s1: Supported ports: [ ] Supported link modes: Not reported Supported pause frame use: No Supports auto-negotiation: No Supported FEC modes: Not reported Advertised link modes: Not reported Advertised pause frame use: No Advertised auto-negotiation: No Advertised FEC modes: Not reported Speed: Unknown! 
Duplex: Full Auto-negotiation: off Port: Other PHYAD: 0 Transceiver: internal Link detected: yes After: $ethtool enP30832s1 > Settings for enP30832s1: Supported ports: [ ] Supported link modes: Not reported Supported pause frame use: No Supports auto-negotiation: No Supported FEC modes: Not reported Advertised link modes: Not reported Advertised pause frame use: No Advertised auto-negotiation: No Advertised FEC modes: Not reported Speed: 16000Mb/s Duplex: Full Auto-negotiation: off Port: Other PHYAD: 0 Transceiver: internal Link detected: yes Signed-off-by: Erni Sri Satya Vennela Reviewed-by: Haiyang Zhang Reviewed-by: Shradha Gupta Reviewed-by: Saurabh Singh Sengar Reviewed-by: Long Li Link: https://patch.msgid.link/1750144656-2021-4-git-send-email-ernis@linux.microsoft.com Signed-off-by: Paolo Abeni (cherry picked from commit a6d5edf11e0cf5a4650f1d353d20ec29de093813) Signed-off-by: Shreeya Patel --- drivers/net/ethernet/microsoft/mana/mana_en.c | 1 + drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 6 ++++++ include/net/mana/mana.h | 2 ++ 3 files changed, 9 insertions(+) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index d897b49ba0665..8fc4fc67ab397 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1294,6 +1294,7 @@ int mana_query_link_cfg(struct mana_port_context *apc) return err; } apc->speed = resp.link_speed_mbps; + apc->max_speed = resp.qos_speed_mbps; return 0; } diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c index 4fb3a04994a2d..a1afa75a94631 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c @@ -495,6 +495,12 @@ static int mana_set_ringparam(struct net_device *ndev, static int mana_get_link_ksettings(struct net_device *ndev, struct ethtool_link_ksettings *cmd) { + struct mana_port_context *apc = netdev_priv(ndev); + int err; + + err = mana_query_link_cfg(apc); + cmd->base.speed = (err) ? SPEED_UNKNOWN : apc->max_speed; + cmd->base.duplex = DUPLEX_FULL; cmd->base.port = PORT_OTHER; diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 00926585a8fe1..550d8be6f79bb 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -532,6 +532,8 @@ struct mana_port_context { u16 port_idx; /* Currently configured speed (mbps) */ u32 speed; + /* Maximum speed supported by the SKU (mbps) */ + u32 max_speed; bool port_is_up; bool port_st_save; /* Saved port state */ From f8f142090e610c3cce09adda0fe10c5bc7c4dbc7 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Tue, 16 Dec 2025 15:13:30 +0000 Subject: [PATCH 26/34] net: mana: Handle unsupported HWC commands jira LE-4472 commit-author Erni Sri Satya Vennela commit ca8ac489ca33c986ff02ee14c3e1c10b86355428 upstream-diff There were conflicts seen when applying this patch due to the following patch being in our tree before this one. 7a3c23599984 ("net: mana: Handle Reset Request from MANA NIC") If any of the HWC commands are not recognized by the underlying hardware, the hardware returns the response header status of -1. Log the information using netdev_info_once to avoid multiple error logs in dmesg. 
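netdev_info_once() prints at most once per call site, so repeated queries on older hardware do not flood the log. In condensed form, the pattern applied to both mana_query_link_cfg() and mana_set_bw_clamp() below looks like this (sketch only; the real message strings name the specific HWC command):

        err = mana_send_request(apc->ac, &req, sizeof(req), &resp, sizeof(resp));
        if (err == -EOPNOTSUPP) {
                /* Unsupported on this hardware: report it once, then stay quiet. */
                netdev_info_once(ndev, "command not supported\n");
                return err;
        }
        if (err) {
                netdev_err(ndev, "command failed: %d\n", err);
                return err;
        }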
Signed-off-by: Erni Sri Satya Vennela Reviewed-by: Haiyang Zhang Reviewed-by: Shradha Gupta Reviewed-by: Saurabh Singh Sengar Reviewed-by: Dipayaan Roy Link: https://patch.msgid.link/1750144656-2021-5-git-send-email-ernis@linux.microsoft.com Signed-off-by: Paolo Abeni (cherry picked from commit ca8ac489ca33c986ff02ee14c3e1c10b86355428) Signed-off-by: Shreeya Patel --- drivers/net/ethernet/microsoft/mana/mana_en.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 8fc4fc67ab397..240f99c8386db 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1274,6 +1274,10 @@ int mana_query_link_cfg(struct mana_port_context *apc) sizeof(resp)); if (err) { + if (err == -EOPNOTSUPP) { + netdev_info_once(ndev, "MANA_QUERY_LINK_CONFIG not supported\n"); + return err; + } netdev_err(ndev, "Failed to query link config: %d\n", err); return err; } @@ -1316,6 +1320,10 @@ int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, sizeof(resp)); if (err) { + if (err == -EOPNOTSUPP) { + netdev_info_once(ndev, "MANA_SET_BW_CLAMP not supported\n"); + return err; + } netdev_err(ndev, "Failed to set bandwidth clamp for speed %u, err = %d", speed, err); return err; From 0dc994f8dc96cb6dd55a75a767c1cc8026aa3bfc Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Tue, 16 Dec 2025 15:15:07 +0000 Subject: [PATCH 27/34] net: mana: Fix build errors when CONFIG_NET_SHAPER is disabled jira LE-4472 commit-author Erni Sri Satya Vennela commit 11cd0206987205ee05b0abd70a8eafa400ba89e3 Fix build errors when CONFIG_NET_SHAPER is disabled, including: drivers/net/ethernet/microsoft/mana/mana_en.c:804:10: error: 'const struct net_device_ops' has no member named 'net_shaper_ops' 804 | .net_shaper_ops = &mana_shaper_ops, drivers/net/ethernet/microsoft/mana/mana_en.c:804:35: error: initialization of 'int (*)(struct net_device *, struct neigh_parms *)' from incompatible pointer type 'const struct net_shaper_ops *' [-Werror=incompatible-pointer-types] 804 | .net_shaper_ops = &mana_shaper_ops, Signed-off-by: Erni Sri Satya Vennela Fixes: 75cabb46935b ("net: mana: Add support for net_shaper_ops") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202506230625.bfUlqb8o-lkp@intel.com/ Reviewed-by: Simon Horman Link: https://patch.msgid.link/1750851355-8067-1-git-send-email-ernis@linux.microsoft.com Signed-off-by: Jakub Kicinski (cherry picked from commit 11cd0206987205ee05b0abd70a8eafa400ba89e3) Signed-off-by: Shreeya Patel --- drivers/net/ethernet/microsoft/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/microsoft/Kconfig b/drivers/net/ethernet/microsoft/Kconfig index 901fbffbf718e..3f36ee6a8ecee 100644 --- a/drivers/net/ethernet/microsoft/Kconfig +++ b/drivers/net/ethernet/microsoft/Kconfig @@ -22,6 +22,7 @@ config MICROSOFT_MANA depends on PCI_HYPERV select AUXILIARY_BUS select PAGE_POOL + select NET_SHAPER help This driver supports Microsoft Azure Network Adapter (MANA). So far, the driver is only supported on X86_64. From 3be90eb0a44bea629b2f641535eb51449bae1edf Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Wed, 17 Dec 2025 11:39:06 +0000 Subject: [PATCH 28/34] RDMA/mana_ib: add additional port counters jira LE-4526 commit-author Zhiyue Qiu commit 084f35b84f57e059b542ea44240a51b294a096a1 Add packet and request port counters to mana_ib. 
Signed-off-by: Zhiyue Qiu Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1752143395-5324-1-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky (cherry picked from commit 084f35b84f57e059b542ea44240a51b294a096a1) Signed-off-by: Shreeya Patel --- drivers/infiniband/hw/mana/counters.c | 18 ++++++++++++++++++ drivers/infiniband/hw/mana/counters.h | 8 ++++++++ drivers/infiniband/hw/mana/mana_ib.h | 8 ++++++++ 3 files changed, 34 insertions(+) diff --git a/drivers/infiniband/hw/mana/counters.c b/drivers/infiniband/hw/mana/counters.c index 6a81365d3b951..e964e74be48da 100644 --- a/drivers/infiniband/hw/mana/counters.c +++ b/drivers/infiniband/hw/mana/counters.c @@ -32,6 +32,14 @@ static const struct rdma_stat_desc mana_ib_port_stats_desc[] = { [MANA_IB_RATE_INC_EVENTS].name = "rate_inc_events", [MANA_IB_NUM_QPS_RECOVERED].name = "num_qps_recovered", [MANA_IB_CURRENT_RATE].name = "current_rate", + [MANA_IB_DUP_RX_REQ].name = "dup_rx_requests", + [MANA_IB_TX_BYTES].name = "tx_bytes", + [MANA_IB_RX_BYTES].name = "rx_bytes", + [MANA_IB_RX_SEND_REQ].name = "rx_send_requests", + [MANA_IB_RX_WRITE_REQ].name = "rx_write_requests", + [MANA_IB_RX_READ_REQ].name = "rx_read_requests", + [MANA_IB_TX_PKT].name = "tx_packets", + [MANA_IB_RX_PKT].name = "rx_packets", }; static const struct rdma_stat_desc mana_ib_device_stats_desc[] = { @@ -100,6 +108,7 @@ static int mana_ib_get_hw_port_stats(struct ib_device *ibdev, struct rdma_hw_sta mana_gd_init_req_hdr(&req.hdr, MANA_IB_QUERY_VF_COUNTERS, sizeof(req), sizeof(resp)); + req.hdr.resp.msg_version = GDMA_MESSAGE_V2; req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; @@ -148,6 +157,15 @@ static int mana_ib_get_hw_port_stats(struct ib_device *ibdev, struct rdma_hw_sta stats->value[MANA_IB_NUM_QPS_RECOVERED] = resp.num_qps_recovered; stats->value[MANA_IB_CURRENT_RATE] = resp.current_rate; + stats->value[MANA_IB_DUP_RX_REQ] = resp.dup_rx_req; + stats->value[MANA_IB_TX_BYTES] = resp.tx_bytes; + stats->value[MANA_IB_RX_BYTES] = resp.rx_bytes; + stats->value[MANA_IB_RX_SEND_REQ] = resp.rx_send_req; + stats->value[MANA_IB_RX_WRITE_REQ] = resp.rx_write_req; + stats->value[MANA_IB_RX_READ_REQ] = resp.rx_read_req; + stats->value[MANA_IB_TX_PKT] = resp.tx_pkt; + stats->value[MANA_IB_RX_PKT] = resp.rx_pkt; + return ARRAY_SIZE(mana_ib_port_stats_desc); } diff --git a/drivers/infiniband/hw/mana/counters.h b/drivers/infiniband/hw/mana/counters.h index 987a6fee83c94..f68e776bb41d7 100644 --- a/drivers/infiniband/hw/mana/counters.h +++ b/drivers/infiniband/hw/mana/counters.h @@ -35,6 +35,14 @@ enum mana_ib_port_counters { MANA_IB_RATE_INC_EVENTS, MANA_IB_NUM_QPS_RECOVERED, MANA_IB_CURRENT_RATE, + MANA_IB_DUP_RX_REQ, + MANA_IB_TX_BYTES, + MANA_IB_RX_BYTES, + MANA_IB_RX_SEND_REQ, + MANA_IB_RX_WRITE_REQ, + MANA_IB_RX_READ_REQ, + MANA_IB_TX_PKT, + MANA_IB_RX_PKT, }; enum mana_ib_device_counters { diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index eddd0a83b97ee..369825fdeff86 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -516,6 +516,14 @@ struct mana_rnic_query_vf_cntrs_resp { u64 rate_inc_events; u64 num_qps_recovered; u64 current_rate; + u64 dup_rx_req; + u64 tx_bytes; + u64 rx_bytes; + u64 rx_send_req; + u64 rx_write_req; + u64 rx_read_req; + u64 tx_pkt; + u64 rx_pkt; }; /* HW Data */ struct mana_rnic_query_device_cntrs_req { From e43d6f1847552115f525b3904d8f63ca1f5beb06 Mon Sep 17 00:00:00 2001 From: 
Shreeya Patel Date: Wed, 17 Dec 2025 11:54:22 +0000 Subject: [PATCH 29/34] RDMA/mana_ib: Drain send wrs of GSI QP jira LE-4523 commit-author Konstantin Taranov commit 44d69d3cf2e8047c279cbb9708f05e2c43e33234 Drain send WRs of the GSI QP on device removal. In rare servicing scenarios, the hardware may delete the state of the GSI QP, preventing it from generating CQEs for pending send WRs. Since WRs submitted to the GSI QP hold CM resources, the device cannot be removed until those WRs are completed. This patch marks all pending send WRs as failed, allowing the GSI QP to release the CM resources and enabling safe device removal. Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1753779618-23629-1-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky (cherry picked from commit 44d69d3cf2e8047c279cbb9708f05e2c43e33234) Signed-off-by: Shreeya Patel --- drivers/infiniband/hw/mana/cq.c | 26 ++++++++++++++++++++++++++ drivers/infiniband/hw/mana/device.c | 3 +++ drivers/infiniband/hw/mana/mana_ib.h | 3 +++ 3 files changed, 32 insertions(+) diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 28e154bbb50f8..1becc87791235 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -291,6 +291,32 @@ static int mana_process_completions(struct mana_ib_cq *cq, int nwc, struct ib_wc return wc_index; } +void mana_drain_gsi_sqs(struct mana_ib_dev *mdev) +{ + struct mana_ib_qp *qp = mana_get_qp_ref(mdev, MANA_GSI_QPN, false); + struct ud_sq_shadow_wqe *shadow_wqe; + struct mana_ib_cq *cq; + unsigned long flags; + + if (!qp) + return; + + cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + + spin_lock_irqsave(&cq->cq_lock, flags); + while ((shadow_wqe = shadow_queue_get_next_to_complete(&qp->shadow_sq)) + != NULL) { + shadow_wqe->header.error_code = IB_WC_GENERAL_ERR; + shadow_queue_advance_next_to_complete(&qp->shadow_sq); + } + spin_unlock_irqrestore(&cq->cq_lock, flags); + + if (cq->ibcq.comp_handler) + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + + mana_put_qp_ref(qp); +} + int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 4f83d0f7da043..0b84d99335bff 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -225,6 +225,9 @@ static void mana_ib_remove(struct auxiliary_device *adev) { struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); + if (mana_ib_is_rnic(dev)) + mana_drain_gsi_sqs(dev); + ib_unregister_device(&dev->ib_dev); dma_pool_destroy(dev->av_pool); if (mana_ib_is_rnic(dev)) { diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 369825fdeff86..2c3400213b12f 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -43,6 +43,8 @@ */ #define MANA_AV_BUFFER_SIZE 64 +#define MANA_GSI_QPN (1) + struct mana_ib_adapter_caps { u32 max_sq_id; u32 max_rq_id; @@ -716,6 +718,7 @@ int mana_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, int mana_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); +void mana_drain_gsi_sqs(struct mana_ib_dev *mdev); int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); From 
1e2218efb81ffa8b9d1c8f634f60936d43e2c8d1 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Wed, 17 Dec 2025 12:07:36 +0000 Subject: [PATCH 30/34] net: hv_netvsc: fix loss of early receive events from host during channel open. jira LE-4493 commit-author Dipayaan Roy commit 9448ccd853368582efa9db05db344f8bb9dffe0f The hv_netvsc driver currently enables NAPI after opening the primary and subchannels. This ordering creates a race: if the Hyper-V host places data in the host -> guest ring buffer and signals the channel before napi_enable() has been called, the channel callback will run but napi_schedule_prep() will return false. As a result, the NAPI poller never gets scheduled, the data in the ring buffer is not consumed, and the receive queue may remain permanently stuck until another interrupt happens to arrive. Fix this by enabling NAPI and registering it with the RX/TX queues before vmbus channel is opened. This guarantees that any early host signal after open will correctly trigger NAPI scheduling and the ring buffer will be drained. Fixes: 76bb5db5c749d ("netvsc: fix use after free on module removal") Signed-off-by: Dipayaan Roy Link: https://patch.msgid.link/20250825115627.GA32189@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net Signed-off-by: Jakub Kicinski (cherry picked from commit 9448ccd853368582efa9db05db344f8bb9dffe0f) Signed-off-by: Shreeya Patel --- drivers/net/hyperv/netvsc.c | 17 ++++++++--------- drivers/net/hyperv/rndis_filter.c | 23 ++++++++++++++++------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 720104661d7f2..60a4629fe6ba7 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1812,6 +1812,11 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, /* Enable NAPI handler before init callbacks */ netif_napi_add(ndev, &net_device->chan_table[0].napi, netvsc_poll); + napi_enable(&net_device->chan_table[0].napi); + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, + &net_device->chan_table[0].napi); + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, + &net_device->chan_table[0].napi); /* Open the channel */ device->channel->next_request_id_callback = vmbus_next_request_id; @@ -1831,12 +1836,6 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, /* Channel is opened */ netdev_dbg(ndev, "hv_netvsc channel opened successfully\n"); - napi_enable(&net_device->chan_table[0].napi); - netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, - &net_device->chan_table[0].napi); - netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, - &net_device->chan_table[0].napi); - /* Connect with the NetVsp */ ret = netvsc_connect_vsp(device, net_device, device_info); if (ret != 0) { @@ -1854,14 +1853,14 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, close: RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); - netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, NULL); - netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, NULL); - napi_disable(&net_device->chan_table[0].napi); /* Now, we can close the channel safely */ vmbus_close(device->channel); cleanup: + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, NULL); + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, NULL); + napi_disable(&net_device->chan_table[0].napi); netif_napi_del(&net_device->chan_table[0].napi); cleanup2: diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 9e73959e61ee0..c35f9685b6bf0 100644 --- 
a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1252,17 +1252,26 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) new_sc->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes); new_sc->max_pkt_size = NETVSC_MAX_PKT_SIZE; + /* Enable napi before opening the vmbus channel to avoid races + * as the host placing data on the host->guest ring may be left + * out if napi was not enabled. + */ + napi_enable(&nvchan->napi); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX, + &nvchan->napi); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX, + &nvchan->napi); + ret = vmbus_open(new_sc, netvsc_ring_bytes, netvsc_ring_bytes, NULL, 0, netvsc_channel_cb, nvchan); - if (ret == 0) { - napi_enable(&nvchan->napi); - netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX, - &nvchan->napi); - netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX, - &nvchan->napi); - } else { + if (ret != 0) { netdev_notice(ndev, "sub channel open failed: %d\n", ret); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX, + NULL); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX, + NULL); + napi_disable(&nvchan->napi); } if (atomic_inc_return(&nvscdev->open_chn) == nvscdev->num_chn) From 92c8a00d527cb630b1794a2400d968fd7bc16236 Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Wed, 17 Dec 2025 12:16:49 +0000 Subject: [PATCH 31/34] net: mana: Reduce waiting time if HWC not responding jira LE-4496 commit-author Haiyang Zhang commit c4deabbc1abe452ea230b86d53ed3711e5a8a062 If HW Channel (HWC) is not responding, reduce the waiting time, so further steps will fail quickly. This will prevent getting stuck for a long time (30 minutes or more), for example, during unloading while HWC is not responding. Signed-off-by: Haiyang Zhang Link: https://patch.msgid.link/1757537841-5063-1-git-send-email-haiyangz@linux.microsoft.com Signed-off-by: Jakub Kicinski (cherry picked from commit c4deabbc1abe452ea230b86d53ed3711e5a8a062) Signed-off-by: Shreeya Patel --- drivers/net/ethernet/microsoft/mana/hw_channel.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c index 2a3036976cab3..98d494e09989e 100644 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c @@ -880,7 +880,12 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len, if (!wait_for_completion_timeout(&ctx->comp_event, (msecs_to_jiffies(hwc->hwc_timeout)))) { if (hwc->hwc_timeout != 0) - dev_err(hwc->dev, "HWC: Request timed out!\n"); + dev_err(hwc->dev, "HWC: Request timed out: %u ms\n", + hwc->hwc_timeout); + + /* Reduce further waiting if HWC no response */ + if (hwc->hwc_timeout > 1) + hwc->hwc_timeout = 1; err = -ETIMEDOUT; goto out; From ae14eee6d32dc362a7017329e136a312241e95fa Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Wed, 17 Dec 2025 12:19:32 +0000 Subject: [PATCH 32/34] RDMA/mana_ib: Extend modify QP jira LE-4520 commit-author Shiraz Saleem commit 2bd7dd383609f11330814ecc0d3c10b67073a6be Extend modify QP to support further attributes: local_ack_timeout, UD qkey, rate_limit, qp_access_flags, flow_label, max_rd_atomic. 
Signed-off-by: Shiraz Saleem Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1757923172-4475-1-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky (cherry picked from commit 2bd7dd383609f11330814ecc0d3c10b67073a6be) Signed-off-by: Shreeya Patel --- drivers/infiniband/hw/mana/mana_ib.h | 11 +++++++++-- drivers/infiniband/hw/mana/qp.c | 9 +++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 2c3400213b12f..38ea58fe411c7 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -411,7 +411,7 @@ struct mana_ib_ah_attr { u8 traffic_class; u16 src_port; u16 dest_port; - u32 reserved; + u32 flow_label; }; struct mana_rnic_set_qp_state_req { @@ -428,8 +428,15 @@ struct mana_rnic_set_qp_state_req { u32 retry_cnt; u32 rnr_retry; u32 min_rnr_timer; - u32 reserved; + u32 rate_limit; struct mana_ib_ah_attr ah_attr; + u64 reserved1; + u32 qkey; + u32 qp_access_flags; + u8 local_ack_timeout; + u8 max_rd_atomic; + u16 reserved2; + u32 reserved3; }; /* HW Data */ struct mana_rnic_set_qp_state_resp { diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index a6bf4d539e670..48c1f4977f218 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -735,6 +735,8 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int err; mana_gd_init_req_hdr(&req.hdr, MANA_IB_SET_QP_STATE, sizeof(req), sizeof(resp)); + + req.hdr.req.msg_version = GDMA_MESSAGE_V3; req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.qp_handle = qp->qp_handle; @@ -748,6 +750,12 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, req.retry_cnt = attr->retry_cnt; req.rnr_retry = attr->rnr_retry; req.min_rnr_timer = attr->min_rnr_timer; + req.rate_limit = attr->rate_limit; + req.qkey = attr->qkey; + req.local_ack_timeout = attr->timeout; + req.qp_access_flags = attr->qp_access_flags; + req.max_rd_atomic = attr->max_rd_atomic; + if (attr_mask & IB_QP_AV) { ndev = mana_ib_get_netdev(&mdev->ib_dev, ibqp->port); if (!ndev) { @@ -774,6 +782,7 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, ibqp->qp_num, attr->dest_qp_num); req.ah_attr.traffic_class = attr->ah_attr.grh.traffic_class >> 2; req.ah_attr.hop_limit = attr->ah_attr.grh.hop_limit; + req.ah_attr.flow_label = attr->ah_attr.grh.flow_label; } err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); From 7890822af880c8b18b2ce2e47df736fe1c654cbf Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Wed, 17 Dec 2025 12:23:47 +0000 Subject: [PATCH 33/34] scsi: storvsc: Prefer returning channel with the same CPU as on the I/O issuing CPU jira LE-4536 commit-author Long Li commit b69ffeaa0ae43892683113b3f4ddf156398738b9 When selecting an outgoing channel for I/O, storvsc tries to select a channel with a returning CPU that is not the same as the issuing CPU. This worked well in the past; however, it doesn't work well when Hyper-V exposes a large number of channels (up to the number of all CPUs). Using a different CPU for the returning channel is not efficient on Hyper-V. Change this behavior by preferring the channel with the same CPU as the current I/O issuing CPU whenever possible. Tests have shown improvements in newer Hyper-V/Azure environments, and no regression with older Hyper-V/Azure environments.
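The resulting selection order can be summarized by the stand-alone sketch below; the types and the helper are toy stand-ins chosen for illustration, not the storvsc data structures:

#include <stdbool.h>

#define NR_TOY_CPUS 8

/* Toy per-CPU channel table: "busy" models an outbound ring that has
 * dropped below the low-water mark. */
struct toy_chan {
        bool present;
        bool busy;
};

int toy_pick_channel(const struct toy_chan chan[NR_TOY_CPUS],
                     const int cpu_node[NR_TOY_CPUS], int issuing_cpu)
{
        int cpu;

        /* I. Prefer the channel on the I/O issuing CPU. */
        if (chan[issuing_cpu].present && !chan[issuing_cpu].busy)
                return issuing_cpu;

        /* II. Otherwise, an idle channel on the same NUMA node. */
        for (cpu = 0; cpu < NR_TOY_CPUS; cpu++)
                if (chan[cpu].present && !chan[cpu].busy &&
                    cpu_node[cpu] == cpu_node[issuing_cpu])
                        return cpu;

        /* III. Otherwise, any idle channel on any node. */
        for (cpu = 0; cpu < NR_TOY_CPUS; cpu++)
                if (chan[cpu].present && !chan[cpu].busy)
                        return cpu;

        /* Everything is busy: reuse the issuing CPU's channel if it exists,
         * otherwise the first channel found. */
        if (chan[issuing_cpu].present)
                return issuing_cpu;
        for (cpu = 0; cpu < NR_TOY_CPUS; cpu++)
                if (chan[cpu].present)
                        return cpu;
        return -1;
}

The ordering mirrors the strategy in the hunks below: the issuing CPU's channel first, NUMA locality second, best-effort distribution across all channels last, and a fall-back to the original channel only when everything is busy.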
Tested-by: Raheel Abdul Faizy Signed-off-by: Long Li Message-Id: <1759381530-7414-1-git-send-email-longli@linux.microsoft.com> Signed-off-by: Martin K. Petersen (cherry picked from commit b69ffeaa0ae43892683113b3f4ddf156398738b9) Signed-off-by: Shreeya Patel --- drivers/scsi/storvsc_drv.c | 96 ++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 51 deletions(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index ed62e59a4e23d..efc83ce935224 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1406,14 +1406,19 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, } /* - * Our channel array is sparsley populated and we + * Our channel array could be sparsley populated and we * initiated I/O on a processor/hw-q that does not * currently have a designated channel. Fix this. * The strategy is simple: - * I. Ensure NUMA locality - * II. Distribute evenly (best effort) + * I. Prefer the channel associated with the current CPU + * II. Ensure NUMA locality + * III. Distribute evenly (best effort) */ + /* Prefer the channel on the I/O issuing processor/hw-q */ + if (cpumask_test_cpu(q_num, &stor_device->alloced_cpus)) + return stor_device->stor_chns[q_num]; + node_mask = cpumask_of_node(cpu_to_node(q_num)); num_channels = 0; @@ -1469,59 +1474,48 @@ static int storvsc_do_io(struct hv_device *device, /* See storvsc_change_target_cpu(). */ outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]); if (outgoing_channel != NULL) { - if (outgoing_channel->target_cpu == q_num) { - /* - * Ideally, we want to pick a different channel if - * available on the same NUMA node. - */ - node_mask = cpumask_of_node(cpu_to_node(q_num)); - for_each_cpu_wrap(tgt_cpu, - &stor_device->alloced_cpus, q_num + 1) { - if (!cpumask_test_cpu(tgt_cpu, node_mask)) - continue; - if (tgt_cpu == q_num) - continue; - channel = READ_ONCE( - stor_device->stor_chns[tgt_cpu]); - if (channel == NULL) - continue; - if (hv_get_avail_to_write_percent( - &channel->outbound) - > ring_avail_percent_lowater) { - outgoing_channel = channel; - goto found_channel; - } - } + if (hv_get_avail_to_write_percent(&outgoing_channel->outbound) + > ring_avail_percent_lowater) + goto found_channel; - /* - * All the other channels on the same NUMA node are - * busy. Try to use the channel on the current CPU - */ - if (hv_get_avail_to_write_percent( - &outgoing_channel->outbound) - > ring_avail_percent_lowater) + /* + * Channel is busy, try to find a channel on the same NUMA node + */ + node_mask = cpumask_of_node(cpu_to_node(q_num)); + for_each_cpu_wrap(tgt_cpu, &stor_device->alloced_cpus, + q_num + 1) { + if (!cpumask_test_cpu(tgt_cpu, node_mask)) + continue; + channel = READ_ONCE(stor_device->stor_chns[tgt_cpu]); + if (!channel) + continue; + if (hv_get_avail_to_write_percent(&channel->outbound) + > ring_avail_percent_lowater) { + outgoing_channel = channel; goto found_channel; + } + } - /* - * If we reach here, all the channels on the current - * NUMA node are busy. Try to find a channel in - * other NUMA nodes - */ - for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) { - if (cpumask_test_cpu(tgt_cpu, node_mask)) - continue; - channel = READ_ONCE( - stor_device->stor_chns[tgt_cpu]); - if (channel == NULL) - continue; - if (hv_get_avail_to_write_percent( - &channel->outbound) - > ring_avail_percent_lowater) { - outgoing_channel = channel; - goto found_channel; - } + /* + * If we reach here, all the channels on the current + * NUMA node are busy. 
Try to find a channel in + all NUMA nodes */ + for_each_cpu_wrap(tgt_cpu, &stor_device->alloced_cpus, + q_num + 1) { + channel = READ_ONCE(stor_device->stor_chns[tgt_cpu]); + if (!channel) + continue; + if (hv_get_avail_to_write_percent(&channel->outbound) + > ring_avail_percent_lowater) { + outgoing_channel = channel; + goto found_channel; } } + /* + * If we reach here, all the channels are busy. Use the + * original channel found. + */ } else { spin_lock_irqsave(&stor_device->lock, flags); outgoing_channel = stor_device->stor_chns[q_num]; From ebc1e59bcedd0fe1c71e14f503c7de8dbdcdc91f Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Wed, 17 Dec 2025 12:02:49 +0000 Subject: [PATCH 34/34] net: mana: Use page pool fragments for RX buffers instead of full pages to improve memory efficiency. jira LE-4489 commit-author Dipayaan Roy commit 730ff06d3f5cc2ce0348414b78c10528b767d4a3 upstream-diff This patch was causing build failures due to missing commit 0f9214046893 ("memory-provider: dmabuf devmem memory provider"). To fix it, we have removed the pprm.queue_idx parameter, which does not appear to be used even after being set because of the missing commit. This patch enhances RX buffer handling in the mana driver by allocating pages from a page pool and slicing them into MTU-sized fragments, rather than dedicating a full page per packet. This approach is especially beneficial on systems with large base page sizes like 64KB. Key improvements: - Proper integration of page pool for RX buffer allocations. - MTU-sized buffer slicing to improve memory utilization. - Reduce overall per Rx queue memory footprint. - Automatic fallback to full-page buffers when: * Jumbo frames are enabled (MTU > PAGE_SIZE / 2). * The XDP path is active, to avoid complexities with fragment reuse. Testing on VMs with 64KB pages shows around 200% throughput improvement. Memory efficiency is significantly improved due to reduced wastage in page allocations. Example: We are now able to fit 35 rx buffers in a single 64kb page for MTU size of 1500, instead of 1 rx buffer per page previously. Tested: - iperf3, iperf2, and nttcp benchmarks. - Jumbo frames with MTU 9000. - Native XDP programs (XDP_PASS, XDP_DROP, XDP_TX, XDP_REDIRECT) for testing the XDP path in the driver. - Memory leak detection (kmemleak). - Driver load/unload, reboot, and stress scenarios.
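As a sanity check on the "Example" above, the stand-alone sketch below reproduces the per-page fragment arithmetic; the skb_shared_info overhead, the alignment and the page size are assumptions chosen to mirror the description, not values taken from a specific kernel build:

#include <stdio.h>

#define TOY_PAGE_SIZE   65536u  /* 64 KB base page */
#define TOY_ETH_HLEN    14u
#define TOY_SHINFO_PAD  320u    /* assumed skb_shared_info overhead */
#define TOY_FRAG_ALIGN  64u     /* MANA_RX_FRAG_ALIGNMENT in this patch */

static unsigned int align_up(unsigned int v, unsigned int a)
{
        return (v + a - 1) / a * a;
}

int main(void)
{
        unsigned int mtu = 1500;
        /* Per-buffer footprint: MTU + Ethernet header + shared-info pad,
         * rounded up to the fragment alignment. */
        unsigned int buf_size = align_up(mtu + TOY_ETH_HLEN + TOY_SHINFO_PAD,
                                         TOY_FRAG_ALIGN);
        unsigned int frag_count = TOY_PAGE_SIZE / buf_size;

        printf("buf_size=%u frags_per_page=%u\n", buf_size, frag_count);
        return 0;
}

With these assumed constants each RX buffer rounds up to 1856 bytes, so a 64 KB page yields 35 fragments, matching the figure quoted in the example.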
Reviewed-by: Jacob Keller Reviewed-by: Saurabh Sengar Reviewed-by: Haiyang Zhang Signed-off-by: Dipayaan Roy Link: https://patch.msgid.link/20250814140410.GA22089@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net Signed-off-by: Paolo Abeni (cherry picked from commit 730ff06d3f5cc2ce0348414b78c10528b767d4a3) Signed-off-by: Shreeya Patel --- .../net/ethernet/microsoft/mana/mana_bpf.c | 46 +++++- drivers/net/ethernet/microsoft/mana/mana_en.c | 150 ++++++++++++------ include/net/mana/mana.h | 4 + 3 files changed, 149 insertions(+), 51 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_bpf.c b/drivers/net/ethernet/microsoft/mana/mana_bpf.c index d30721d4516fc..7697c9b52ed34 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_bpf.c +++ b/drivers/net/ethernet/microsoft/mana/mana_bpf.c @@ -174,6 +174,7 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog, struct mana_port_context *apc = netdev_priv(ndev); struct bpf_prog *old_prog; struct gdma_context *gc; + int err; gc = apc->ac->gdma_dev->gdma_context; @@ -195,11 +196,45 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog, */ apc->bpf_prog = prog; - if (old_prog) - bpf_prog_put(old_prog); + if (apc->port_is_up) { + /* Re-create rxq's after xdp prog was loaded or unloaded. + * Ex: re create rxq's to switch from full pages to smaller + * size page fragments when xdp prog is unloaded and + * vice-versa. + */ + + /* Pre-allocate buffers to prevent failure in mana_attach */ + err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "XDP: Insufficient memory for tx/rx re-config"); + return err; + } + + err = mana_detach(ndev, false); + if (err) { + netdev_err(ndev, + "mana_detach failed at xdp set: %d\n", err); + NL_SET_ERR_MSG_MOD(extack, + "XDP: Re-config failed at detach"); + goto err_dealloc_rxbuffs; + } + + err = mana_attach(ndev); + if (err) { + netdev_err(ndev, + "mana_attach failed at xdp set: %d\n", err); + NL_SET_ERR_MSG_MOD(extack, + "XDP: Re-config failed at attach"); + goto err_dealloc_rxbuffs; + } - if (apc->port_is_up) mana_chn_setxdp(apc, prog); + mana_pre_dealloc_rxbufs(apc); + } + + if (old_prog) + bpf_prog_put(old_prog); if (prog) ndev->max_mtu = MANA_XDP_MTU_MAX; @@ -207,6 +242,11 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog, ndev->max_mtu = gc->adapter_mtu - ETH_HLEN; return 0; + +err_dealloc_rxbuffs: + apc->bpf_prog = old_prog; + mana_pre_dealloc_rxbufs(apc); + return err; } int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 240f99c8386db..427663e55ef3f 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -55,6 +55,15 @@ static bool mana_en_need_log(struct mana_port_context *apc, int err) return true; } +static void mana_put_rx_page(struct mana_rxq *rxq, struct page *page, + bool from_pool) +{ + if (from_pool) + page_pool_put_full_page(rxq->page_pool, page, false); + else + put_page(page); +} + /* Microsoft Azure Network Adapter (MANA) functions */ static int mana_open(struct net_device *ndev) @@ -628,21 +637,40 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da) } /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */ -static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size, - u32 *headroom) +static void mana_get_rxbuf_cfg(struct 
mana_port_context *apc, + int mtu, u32 *datasize, u32 *alloc_size, + u32 *headroom, u32 *frag_count) { - if (mtu > MANA_XDP_MTU_MAX) - *headroom = 0; /* no support for XDP */ - else - *headroom = XDP_PACKET_HEADROOM; + u32 len, buf_size; - *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom); + /* Calculate datasize first (consistent across all cases) */ + *datasize = mtu + ETH_HLEN; - /* Using page pool in this case, so alloc_size is PAGE_SIZE */ - if (*alloc_size < PAGE_SIZE) - *alloc_size = PAGE_SIZE; + /* For xdp and jumbo frames make sure only one packet fits per page */ + if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) { + if (mana_xdp_get(apc)) { + *headroom = XDP_PACKET_HEADROOM; + *alloc_size = PAGE_SIZE; + } else { + *headroom = 0; /* no support for XDP */ + *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + + *headroom); + } - *datasize = mtu + ETH_HLEN; + *frag_count = 1; + return; + } + + /* Standard MTU case - optimize for multiple packets per page */ + *headroom = 0; + + /* Calculate base buffer size needed */ + len = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom); + buf_size = ALIGN(len, MANA_RX_FRAG_ALIGNMENT); + + /* Calculate how many packets can fit in a page */ + *frag_count = PAGE_SIZE / buf_size; + *alloc_size = buf_size; } int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_queues) @@ -654,8 +682,9 @@ int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_qu void *va; int i; - mana_get_rxbuf_cfg(new_mtu, &mpc->rxbpre_datasize, - &mpc->rxbpre_alloc_size, &mpc->rxbpre_headroom); + mana_get_rxbuf_cfg(mpc, new_mtu, &mpc->rxbpre_datasize, + &mpc->rxbpre_alloc_size, &mpc->rxbpre_headroom, + &mpc->rxbpre_frag_count); dev = mpc->ac->gdma_dev->gdma_context->dev; @@ -1840,8 +1869,11 @@ static void mana_rx_skb(void *buf_va, bool from_pool, drop: if (from_pool) { - page_pool_recycle_direct(rxq->page_pool, - virt_to_head_page(buf_va)); + if (rxq->frag_count == 1) + page_pool_recycle_direct(rxq->page_pool, + virt_to_head_page(buf_va)); + else + page_pool_free_va(rxq->page_pool, buf_va, true); } else { WARN_ON_ONCE(rxq->xdp_save_va); /* Save for reuse */ @@ -1857,33 +1889,46 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev, dma_addr_t *da, bool *from_pool) { struct page *page; + u32 offset; void *va; - *from_pool = false; - /* Reuse XDP dropped page if available */ - if (rxq->xdp_save_va) { - va = rxq->xdp_save_va; - rxq->xdp_save_va = NULL; - } else { - page = page_pool_dev_alloc_pages(rxq->page_pool); - if (!page) + /* Don't use fragments for jumbo frames or XDP where it's 1 fragment + * per page. 
+ */ + if (rxq->frag_count == 1) { + /* Reuse XDP dropped page if available */ + if (rxq->xdp_save_va) { + va = rxq->xdp_save_va; + page = virt_to_head_page(va); + rxq->xdp_save_va = NULL; + } else { + page = page_pool_dev_alloc_pages(rxq->page_pool); + if (!page) + return NULL; + + *from_pool = true; + va = page_to_virt(page); + } + + *da = dma_map_single(dev, va + rxq->headroom, rxq->datasize, + DMA_FROM_DEVICE); + if (dma_mapping_error(dev, *da)) { + mana_put_rx_page(rxq, page, *from_pool); return NULL; + } - *from_pool = true; - va = page_to_virt(page); + return va; } - *da = dma_map_single(dev, va + rxq->headroom, rxq->datasize, - DMA_FROM_DEVICE); - if (dma_mapping_error(dev, *da)) { - if (*from_pool) - page_pool_put_full_page(rxq->page_pool, page, false); - else - put_page(virt_to_head_page(va)); - + page = page_pool_dev_alloc_frag(rxq->page_pool, &offset, + rxq->alloc_size); + if (!page) return NULL; - } + + va = page_to_virt(page) + offset; + *da = page_pool_get_dma_addr(page) + offset + rxq->headroom; + *from_pool = true; return va; } @@ -1900,9 +1945,9 @@ static void mana_refill_rx_oob(struct device *dev, struct mana_rxq *rxq, va = mana_get_rxfrag(rxq, dev, &da, &from_pool); if (!va) return; - - dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize, - DMA_FROM_DEVICE); + if (!rxoob->from_pool || rxq->frag_count == 1) + dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize, + DMA_FROM_DEVICE); *old_buf = rxoob->buf_va; *old_fp = rxoob->from_pool; @@ -2307,15 +2352,15 @@ static void mana_destroy_rxq(struct mana_port_context *apc, if (!rx_oob->buf_va) continue; - dma_unmap_single(dev, rx_oob->sgl[0].address, - rx_oob->sgl[0].size, DMA_FROM_DEVICE); - page = virt_to_head_page(rx_oob->buf_va); - if (rx_oob->from_pool) - page_pool_put_full_page(rxq->page_pool, page, false); - else - put_page(page); + if (rxq->frag_count == 1 || !rx_oob->from_pool) { + dma_unmap_single(dev, rx_oob->sgl[0].address, + rx_oob->sgl[0].size, DMA_FROM_DEVICE); + mana_put_rx_page(rxq, page, rx_oob->from_pool); + } else { + page_pool_free_va(rxq->page_pool, rx_oob->buf_va, true); + } rx_oob->buf_va = NULL; } @@ -2421,11 +2466,21 @@ static int mana_create_page_pool(struct mana_rxq *rxq, struct gdma_context *gc) struct page_pool_params pprm = {}; int ret; - pprm.pool_size = mpc->rx_queue_size; + pprm.pool_size = mpc->rx_queue_size / rxq->frag_count + 1; pprm.nid = gc->numa_node; pprm.napi = &rxq->rx_cq.napi; pprm.netdev = rxq->ndev; pprm.order = get_order(rxq->alloc_size); + pprm.dev = gc->dev; + + /* Let the page pool do the dma map when page sharing with multiple + * fragments enabled for rx buffers. 
+ */ + if (rxq->frag_count > 1) { + pprm.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; + pprm.max_len = PAGE_SIZE; + pprm.dma_dir = DMA_FROM_DEVICE; + } rxq->page_pool = page_pool_create(&pprm); @@ -2464,9 +2519,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, rxq->rxq_idx = rxq_idx; rxq->rxobj = INVALID_MANA_HANDLE; - mana_get_rxbuf_cfg(ndev->mtu, &rxq->datasize, &rxq->alloc_size, - &rxq->headroom); - + mana_get_rxbuf_cfg(apc, ndev->mtu, &rxq->datasize, &rxq->alloc_size, + &rxq->headroom, &rxq->frag_count); /* Create page pool for RX queue */ err = mana_create_page_pool(rxq, gc); if (err) { diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 550d8be6f79bb..fa8fc9163d240 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -64,6 +64,8 @@ enum TRI_STATE { #define MANA_STATS_RX_COUNT 5 #define MANA_STATS_TX_COUNT 11 +#define MANA_RX_FRAG_ALIGNMENT 64 + struct mana_stats_rx { u64 packets; u64 bytes; @@ -327,6 +329,7 @@ struct mana_rxq { u32 datasize; u32 alloc_size; u32 headroom; + u32 frag_count; mana_handle_t rxobj; @@ -509,6 +512,7 @@ struct mana_port_context { u32 rxbpre_datasize; u32 rxbpre_alloc_size; u32 rxbpre_headroom; + u32 rxbpre_frag_count; struct bpf_prog *bpf_prog;