diff --git a/rust-toolchain b/rust-toolchain
deleted file mode 100644
index af92bdd9..00000000
--- a/rust-toolchain
+++ /dev/null
@@ -1 +0,0 @@
-1.63.0
diff --git a/src/bn256/assembly.rs b/src/bn256/assembly.rs
index 32eb8092..1a47655d 100644
--- a/src/bn256/assembly.rs
+++ b/src/bn256/assembly.rs
@@ -1,380 +1,11 @@
-macro_rules! assembly_field {
+macro_rules! field_arithmetic_asm {
     (
         $field:ident,
         $modulus:ident,
-        $inv:ident,
-        $modulus_str:ident,
-        $two_inv:ident,
-        $root_of_unity_inv:ident,
-        $delta:ident,
-        $zeta:ident,
-        $r:ident,
-        $r2:ident,
-        $r3:ident
+        $inv:ident
     ) => {
         use std::arch::asm;
 
-        impl $field {
-            /// Returns zero, the additive identity.
-            #[inline]
-            pub const fn zero() -> $field {
-                $field([0, 0, 0, 0])
-            }
-
-            /// Returns one, the multiplicative identity.
-            #[inline]
-            pub const fn one() -> $field {
-                $r
-            }
-
-            fn from_u512(limbs: [u64; 8]) -> $field {
-                // We reduce an arbitrary 512-bit number by decomposing it into two 256-bit digits
-                // with the higher bits multiplied by 2^256. Thus, we perform two reductions
-                //
-                // 1. the lower bits are multiplied by R^2, as normal
-                // 2. the upper bits are multiplied by R^2 * 2^256 = R^3
-                //
-                // and computing their sum in the field. It remains to see that arbitrary 256-bit
-                // numbers can be placed into Montgomery form safely using the reduction. The
-                // reduction works so long as the product is less than R=2^256 multiplied by
-                // the modulus. This holds because for any `c` smaller than the modulus, we have
-                // that (2^256 - 1)*c is an acceptable product for the reduction. Therefore, the
-                // reduction always works so long as `c` is in the field; in this case it is either the
-                // constant `R2` or `R3`.
-                let d0 = $field([limbs[0], limbs[1], limbs[2], limbs[3]]);
-                let d1 = $field([limbs[4], limbs[5], limbs[6], limbs[7]]);
-                // Convert to Montgomery form
-                d0 * $r2 + d1 * $r3
-            }
-
-            /// Converts from an integer represented in little endian
-            /// into its (congruent) `$field` representation.
-            pub const fn from_raw(val: [u64; 4]) -> Self {
-                // Multiplication
-                let (r0, carry) = mac(0, val[0], $r2.0[0], 0);
-                let (r1, carry) = mac(0, val[0], $r2.0[1], carry);
-                let (r2, carry) = mac(0, val[0], $r2.0[2], carry);
-                let (r3, r4) = mac(0, val[0], $r2.0[3], carry);
-
-                let (r1, carry) = mac(r1, val[1], $r2.0[0], 0);
-                let (r2, carry) = mac(r2, val[1], $r2.0[1], carry);
-                let (r3, carry) = mac(r3, val[1], $r2.0[2], carry);
-                let (r4, r5) = mac(r4, val[1], $r2.0[3], carry);
-
-                let (r2, carry) = mac(r2, val[2], $r2.0[0], 0);
-                let (r3, carry) = mac(r3, val[2], $r2.0[1], carry);
-                let (r4, carry) = mac(r4, val[2], $r2.0[2], carry);
-                let (r5, r6) = mac(r5, val[2], $r2.0[3], carry);
-
-                let (r3, carry) = mac(r3, val[3], $r2.0[0], 0);
-                let (r4, carry) = mac(r4, val[3], $r2.0[1], carry);
-                let (r5, carry) = mac(r5, val[3], $r2.0[2], carry);
-                let (r6, r7) = mac(r6, val[3], $r2.0[3], carry);
-
-                // Montgomery reduction (first part)
-                let k = r0.wrapping_mul($inv);
-                let (_, carry) = mac(r0, k, $modulus.0[0], 0);
-                let (r1, carry) = mac(r1, k, $modulus.0[1], carry);
-                let (r2, carry) = mac(r2, k, $modulus.0[2], carry);
-                let (r3, carry) = mac(r3, k, $modulus.0[3], carry);
-                let (r4, carry2) = adc(r4, 0, carry);
-
-                let k = r1.wrapping_mul($inv);
-                let (_, carry) = mac(r1, k, $modulus.0[0], 0);
-                let (r2, carry) = mac(r2, k, $modulus.0[1], carry);
-                let (r3, carry) = mac(r3, k, $modulus.0[2], carry);
-                let (r4, carry) = mac(r4, k, $modulus.0[3], carry);
-                let (r5, carry2) = adc(r5, carry2, carry);
-
-                let k = r2.wrapping_mul($inv);
-                let (_, carry) = mac(r2, k, $modulus.0[0], 0);
-                let (r3, carry) = mac(r3, k, $modulus.0[1], carry);
-                let (r4, carry) = mac(r4, k, $modulus.0[2], carry);
-                let (r5, carry) = mac(r5, k, $modulus.0[3], carry);
-                let (r6, carry2) = adc(r6, carry2, carry);
-
-                let k = r3.wrapping_mul($inv);
-                let (_, carry) = mac(r3, k, $modulus.0[0], 0);
-                let (r4, carry) = mac(r4, k, $modulus.0[1], carry);
-                let (r5, carry) = mac(r5, k, $modulus.0[2], carry);
-                let (r6, carry) = mac(r6, k, $modulus.0[3], carry);
-                let (r7, _) = adc(r7, carry2, carry);
-
-                // Montgomery reduction (sub part)
-                let (d0, borrow) = sbb(r4, $modulus.0[0], 0);
-                let (d1, borrow) = sbb(r5, $modulus.0[1], borrow);
-                let (d2, borrow) = sbb(r6, $modulus.0[2], borrow);
-                let (d3, borrow) = sbb(r7, $modulus.0[3], borrow);
-
-                let (d0, carry) = adc(d0, $modulus.0[0] & borrow, 0);
-                let (d1, carry) = adc(d1, $modulus.0[1] & borrow, carry);
-                let (d2, carry) = adc(d2, $modulus.0[2] & borrow, carry);
-                let (d3, _) = adc(d3, $modulus.0[3] & borrow, carry);
-
-                $field([d0, d1, d2, d3])
-            }
-
-            /// Attempts to convert a little-endian byte representation of
-            /// a scalar into a `Fr`, failing if the input is not canonical.
-            pub fn from_bytes(bytes: &[u8; 32]) -> CtOption<$field> {
-                <Self as ff::PrimeField>::from_repr(*bytes)
-            }
-
-            /// Converts an element of `Fr` into a byte representation in
-            /// little-endian byte order.
-            pub fn to_bytes(&self) -> [u8; 32] {
-                <Self as ff::PrimeField>::to_repr(self)
-            }
-        }
-
-        impl Group for $field {
-            type Scalar = Self;
-
-            fn group_zero() -> Self {
-                Self::zero()
-            }
-            fn group_add(&mut self, rhs: &Self) {
-                *self += *rhs;
-            }
-            fn group_sub(&mut self, rhs: &Self) {
-                *self -= *rhs;
-            }
-            fn group_scale(&mut self, by: &Self::Scalar) {
-                *self *= *by;
-            }
-        }
-
-        impl fmt::Debug for $field {
-            fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-                let tmp = self.to_repr();
-                write!(f, "0x")?;
-                for &b in tmp.iter().rev() {
-                    write!(f, "{:02x}", b)?;
-                }
-                Ok(())
-            }
-        }
-
-        impl Default for $field {
-            #[inline]
-            fn default() -> Self {
-                Self::zero()
-            }
-        }
-
-        impl From<bool> for $field {
-            fn from(bit: bool) -> $field {
-                if bit {
-                    $field::one()
-                } else {
-                    $field::zero()
-                }
-            }
-        }
-
-        impl From<u64> for $field {
-            fn from(val: u64) -> $field {
-                $field([val, 0, 0, 0]) * $r2
-            }
-        }
-
-        impl ConstantTimeEq for $field {
-            fn ct_eq(&self, other: &Self) -> Choice {
-                self.0[0].ct_eq(&other.0[0])
-                    & self.0[1].ct_eq(&other.0[1])
-                    & self.0[2].ct_eq(&other.0[2])
-                    & self.0[3].ct_eq(&other.0[3])
-            }
-        }
-
-        impl core::cmp::Ord for $field {
-            fn cmp(&self, other: &Self) -> core::cmp::Ordering {
-                let left = self.to_repr();
-                let right = other.to_repr();
-                left.iter()
-                    .zip(right.iter())
-                    .rev()
-                    .find_map(|(left_byte, right_byte)| match left_byte.cmp(right_byte) {
-                        core::cmp::Ordering::Equal => None,
-                        res => Some(res),
-                    })
-                    .unwrap_or(core::cmp::Ordering::Equal)
-            }
-        }
-
-        impl core::cmp::PartialOrd for $field {
-            fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
-                Some(self.cmp(other))
-            }
-        }
-
-        impl ConditionallySelectable for $field {
-            fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
-                $field([
-                    u64::conditional_select(&a.0[0], &b.0[0], choice),
-                    u64::conditional_select(&a.0[1], &b.0[1], choice),
-                    u64::conditional_select(&a.0[2], &b.0[2], choice),
-                    u64::conditional_select(&a.0[3], &b.0[3], choice),
-                ])
-            }
-        }
-
-        impl<'a> Neg for &'a $field {
-            type Output = $field;
-
-            #[inline]
-            fn neg(self) -> $field {
-                self.neg()
-            }
-        }
-
-        impl Neg for $field {
-            type Output = $field;
-
-            #[inline]
-            fn neg(self) -> $field {
-                -&self
-            }
-        }
-
-        impl<'a, 'b> Sub<&'b $field> for &'a $field {
-            type Output = $field;
-
-            #[inline]
-            fn sub(self, rhs: &'b $field) -> $field {
-                self.sub(rhs)
-            }
-        }
-
-        impl<'a, 'b> Add<&'b $field> for &'a $field {
-            type Output = $field;
-
-            #[inline]
-            fn add(self, rhs: &'b $field) -> $field {
-                self.add(rhs)
-            }
-        }
-
-        impl<'a, 'b> Mul<&'b $field> for &'a $field {
-            type Output = $field;
-
-            #[inline]
-            fn mul(self, rhs: &'b $field) -> $field {
-                self.mul(rhs)
-            }
-        }
-
-        impl From<$field> for [u8; 32] {
-            fn from(value: $field) -> [u8; 32] {
-                value.to_repr()
-            }
-        }
-
-        impl<'a> From<&'a $field> for [u8; 32] {
-            fn from(value: &'a $field) -> [u8; 32] {
-                value.to_repr()
-            }
-        }
-
-        impl FieldExt for $field {
-            const MODULUS: &'static str = $modulus_str;
-            const TWO_INV: Self = $two_inv;
-            const ROOT_OF_UNITY_INV: Self = $root_of_unity_inv;
-            const DELTA: Self = $delta;
-            const ZETA: Self = $zeta;
-
-            fn from_u128(v: u128) -> Self {
-                $field::from_raw([v as u64, (v >> 64) as u64, 0, 0])
-            }
-
-            /// Converts a 512-bit little endian integer into
-            /// a `$field` by reducing by the modulus.
-            fn from_bytes_wide(bytes: &[u8; 64]) -> $field {
-                $field::from_u512([
-                    u64::from_le_bytes(bytes[0..8].try_into().unwrap()),
-                    u64::from_le_bytes(bytes[8..16].try_into().unwrap()),
-                    u64::from_le_bytes(bytes[16..24].try_into().unwrap()),
-                    u64::from_le_bytes(bytes[24..32].try_into().unwrap()),
-                    u64::from_le_bytes(bytes[32..40].try_into().unwrap()),
-                    u64::from_le_bytes(bytes[40..48].try_into().unwrap()),
-                    u64::from_le_bytes(bytes[48..56].try_into().unwrap()),
-                    u64::from_le_bytes(bytes[56..64].try_into().unwrap()),
-                ])
-            }
-
-            fn get_lower_128(&self) -> u128 {
-                let tmp = $field::montgomery_reduce(&[
-                    self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0,
-                ]);
-
-                u128::from(tmp.0[0]) | (u128::from(tmp.0[1]) << 64)
-            }
-        }
-
-        impl $crate::serde::SerdeObject for $field {
-            fn from_raw_bytes_unchecked(bytes: &[u8]) -> Self {
-                debug_assert_eq!(bytes.len(), 32);
-                let inner =
-                    [0, 8, 16, 24].map(|i| u64::from_le_bytes(bytes[i..i + 8].try_into().unwrap()));
-                Self(inner)
-            }
-            fn from_raw_bytes(bytes: &[u8]) -> Option<Self> {
-                if bytes.len() != 32 {
-                    return None;
-                }
-                let elt = Self::from_raw_bytes_unchecked(bytes);
-                is_less_than(&elt.0, &$modulus.0).then(|| elt)
-            }
-            fn to_raw_bytes(&self) -> Vec<u8> {
-                let mut res = Vec::with_capacity(32);
-                for limb in self.0.iter() {
-                    res.extend_from_slice(&limb.to_le_bytes());
-                }
-                res
-            }
-            fn read_raw_unchecked<R: std::io::Read>(reader: &mut R) -> Self {
-                let inner = [(); 4].map(|_| {
-                    let mut buf = [0; 8];
-                    reader.read_exact(&mut buf).unwrap();
-                    u64::from_le_bytes(buf)
-                });
-                Self(inner)
-            }
-            fn read_raw<R: std::io::Read>(reader: &mut R) -> std::io::Result<Self> {
-                let mut inner = [0u64; 4];
-                for limb in inner.iter_mut() {
-                    let mut buf = [0; 8];
-                    reader.read_exact(&mut buf)?;
-                    *limb = u64::from_le_bytes(buf);
-                }
-                let elt = Self(inner);
-                is_less_than(&elt.0, &$modulus.0)
-                    .then(|| elt)
-                    .ok_or_else(|| {
-                        std::io::Error::new(
-                            std::io::ErrorKind::InvalidData,
-                            "input number is not less than field modulus",
-                        )
-                    })
-            }
-            fn write_raw<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<()> {
-                for limb in self.0.iter() {
-                    writer.write_all(&limb.to_le_bytes())?;
-                }
-                Ok(())
-            }
-        }
-
-        /// Lexicographic comparison of Montgomery forms.
-        #[inline(always)]
-        fn is_less_than(x: &[u64; 4], y: &[u64; 4]) -> bool {
-            let (_, borrow) = sbb(x[0], y[0], 0);
-            let (_, borrow) = sbb(x[1], y[1], borrow);
-            let (_, borrow) = sbb(x[2], y[2], borrow);
-            let (_, borrow) = sbb(x[3], y[3], borrow);
-            borrow >> 63 == 1
-        }
-
         impl $field {
             /// Doubles this field element.
             #[inline]
@@ -1364,4 +995,4 @@ macro_rules! assembly_field {
     };
 }
 
-pub(crate) use assembly_field;
+pub(crate) use field_arithmetic_asm;
diff --git a/src/bn256/curve.rs b/src/bn256/curve.rs
index 4507d6bf..26d072df 100644
--- a/src/bn256/curve.rs
+++ b/src/bn256/curve.rs
@@ -142,12 +142,8 @@ impl CurveEndo for G1 {
     }
 
     fn endomorphism_scalars(k: &Self::ScalarExt) -> (u128, u128) {
-        #[cfg(feature = "asm")]
         let input = Fr::montgomery_reduce(&[k.0[0], k.0[1], k.0[2], k.0[3], 0, 0, 0, 0]).0;
 
-        #[cfg(not(feature = "asm"))]
-        let input = Fr::montgomery_reduce(k.0[0], k.0[1], k.0[2], k.0[3], 0, 0, 0, 0).0;
-
         let c1_512 = mul_512(ENDO_G2, input);
         let c2_512 = mul_512(ENDO_G1, input);
 
diff --git a/src/bn256/fq.rs b/src/bn256/fq.rs
index 4866cc0c..b0f6be67 100644
--- a/src/bn256/fq.rs
+++ b/src/bn256/fq.rs
@@ -1,5 +1,7 @@
 #[cfg(feature = "asm")]
-use super::assembly::assembly_field;
+use super::assembly::field_arithmetic_asm;
+#[cfg(not(feature = "asm"))]
+use crate::{field_arithmetic, field_specific};
 
 use super::LegendreSymbol;
 use crate::arithmetic::{adc, mac, sbb};
@@ -90,13 +92,12 @@ const ZETA: Fq = Fq::from_raw([
 ]);
 
 use crate::{
-    field_arithmetic, field_common, field_specific, impl_add_binop_specify_output,
-    impl_binops_additive, impl_binops_additive_specify_output, impl_binops_multiplicative,
+    field_common, impl_add_binop_specify_output, impl_binops_additive,
+    impl_binops_additive_specify_output, impl_binops_multiplicative,
     impl_binops_multiplicative_mixed, impl_sub_binop_specify_output,
 };
 impl_binops_additive!(Fq, Fq);
 impl_binops_multiplicative!(Fq, Fq);
-#[cfg(not(feature = "asm"))]
 field_common!(
     Fq,
     MODULUS,
@@ -113,19 +114,7 @@ field_common!(
 #[cfg(not(feature = "asm"))]
 field_arithmetic!(Fq, MODULUS, INV, sparse);
 #[cfg(feature = "asm")]
-assembly_field!(
-    Fq,
-    MODULUS,
-    INV,
-    MODULUS_STR,
-    TWO_INV,
-    ROOT_OF_UNITY_INV,
-    DELTA,
-    ZETA,
-    R,
-    R2,
-    R3
-);
+field_arithmetic_asm!(Fq, MODULUS, INV);
 
 impl Fq {
     pub const fn size() -> usize {
@@ -240,13 +229,9 @@ impl ff::PrimeField for Fq {
     fn to_repr(&self) -> Self::Repr {
         // Turn into canonical form by computing
         // (a.R) / R = a
-        #[cfg(feature = "asm")]
         let tmp =
             Self::montgomery_reduce(&[self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0]);
 
-        #[cfg(not(feature = "asm"))]
-        let tmp = Self::montgomery_reduce(self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0);
-
         let mut res = [0; 32];
         res[0..8].copy_from_slice(&tmp.0[0].to_le_bytes());
         res[8..16].copy_from_slice(&tmp.0[1].to_le_bytes());
@@ -273,10 +258,6 @@ impl SqrtRatio for Fq {
     const T_MINUS1_OVER2: [u64; 4] = [0, 0, 0, 0];
 
     fn get_lower_32(&self) -> u32 {
-        #[cfg(not(feature = "asm"))]
-        let tmp = Fq::montgomery_reduce(self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0);
-
-        #[cfg(feature = "asm")]
         let tmp = Fq::montgomery_reduce(&[self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0]);
 
         tmp.0[0] as u32
diff --git a/src/bn256/fr.rs b/src/bn256/fr.rs
index 8901a896..50a3ada4 100644
--- a/src/bn256/fr.rs
+++ b/src/bn256/fr.rs
@@ -1,5 +1,7 @@
 #[cfg(feature = "asm")]
-use super::assembly::assembly_field;
+use super::assembly::field_arithmetic_asm;
+#[cfg(not(feature = "asm"))]
+use crate::{field_arithmetic, field_specific};
 
 use crate::arithmetic::{adc, mac, sbb};
 use core::convert::TryInto;
@@ -115,13 +117,12 @@ const ZETA: Fr = Fr::from_raw([
 ]);
 
 use crate::{
-    field_arithmetic, field_common, field_specific, impl_add_binop_specify_output,
-    impl_binops_additive, impl_binops_additive_specify_output, impl_binops_multiplicative,
+    field_common, impl_add_binop_specify_output, impl_binops_additive,
+    impl_binops_additive_specify_output, impl_binops_multiplicative,
     impl_binops_multiplicative_mixed, impl_sub_binop_specify_output,
 };
 impl_binops_additive!(Fr, Fr);
 impl_binops_multiplicative!(Fr, Fr);
-#[cfg(not(feature = "asm"))]
 field_common!(
     Fr,
     MODULUS,
@@ -138,19 +139,7 @@ field_common!(
 #[cfg(not(feature = "asm"))]
 field_arithmetic!(Fr, MODULUS, INV, sparse);
 #[cfg(feature = "asm")]
-assembly_field!(
-    Fr,
-    MODULUS,
-    INV,
-    MODULUS_STR,
-    TWO_INV,
-    ROOT_OF_UNITY_INV,
-    DELTA,
-    ZETA,
-    R,
-    R2,
-    R3
-);
+field_arithmetic_asm!(Fr, MODULUS, INV);
 
 impl ff::Field for Fr {
     fn random(mut rng: impl RngCore) -> Self {
@@ -238,12 +227,8 @@ impl ff::PrimeField for Fr {
     fn to_repr(&self) -> Self::Repr {
         // Turn into canonical form by computing
         // (a.R) / R = a
-        #[cfg(feature = "asm")]
         let tmp = Fr::montgomery_reduce(&[self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0]);
 
-        #[cfg(not(feature = "asm"))]
-        let tmp = Fr::montgomery_reduce(self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0);
-
         let mut res = [0; 32];
         res[0..8].copy_from_slice(&tmp.0[0].to_le_bytes());
         res[8..16].copy_from_slice(&tmp.0[1].to_le_bytes());
@@ -276,12 +261,7 @@ impl SqrtRatio for Fr {
     ];
 
     fn get_lower_32(&self) -> u32 {
-        #[cfg(not(feature = "asm"))]
-        let tmp = Fr::montgomery_reduce(self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0);
-
-        #[cfg(feature = "asm")]
         let tmp = Fr::montgomery_reduce(&[self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0]);
-
         tmp.0[0] as u32
     }
 }
diff --git a/src/derive/field.rs b/src/derive/field.rs
index 269372dc..d0245193 100644
--- a/src/derive/field.rs
+++ b/src/derive/field.rs
@@ -49,7 +49,74 @@ macro_rules! field_common {
             /// Converts from an integer represented in little endian
             /// into its (congruent) `$field` representation.
             pub const fn from_raw(val: [u64; 4]) -> Self {
-                (&$field(val)).mul(&$r2)
+                #[cfg(feature = "asm")]
+                {
+                    let (r0, carry) = mac(0, val[0], $r2.0[0], 0);
+                    let (r1, carry) = mac(0, val[0], $r2.0[1], carry);
+                    let (r2, carry) = mac(0, val[0], $r2.0[2], carry);
+                    let (r3, r4) = mac(0, val[0], $r2.0[3], carry);
+
+                    let (r1, carry) = mac(r1, val[1], $r2.0[0], 0);
+                    let (r2, carry) = mac(r2, val[1], $r2.0[1], carry);
+                    let (r3, carry) = mac(r3, val[1], $r2.0[2], carry);
+                    let (r4, r5) = mac(r4, val[1], $r2.0[3], carry);
+
+                    let (r2, carry) = mac(r2, val[2], $r2.0[0], 0);
+                    let (r3, carry) = mac(r3, val[2], $r2.0[1], carry);
+                    let (r4, carry) = mac(r4, val[2], $r2.0[2], carry);
+                    let (r5, r6) = mac(r5, val[2], $r2.0[3], carry);
+
+                    let (r3, carry) = mac(r3, val[3], $r2.0[0], 0);
+                    let (r4, carry) = mac(r4, val[3], $r2.0[1], carry);
+                    let (r5, carry) = mac(r5, val[3], $r2.0[2], carry);
+                    let (r6, r7) = mac(r6, val[3], $r2.0[3], carry);
+
+                    // Montgomery reduction (first part)
+                    let k = r0.wrapping_mul($inv);
+                    let (_, carry) = mac(r0, k, $modulus.0[0], 0);
+                    let (r1, carry) = mac(r1, k, $modulus.0[1], carry);
+                    let (r2, carry) = mac(r2, k, $modulus.0[2], carry);
+                    let (r3, carry) = mac(r3, k, $modulus.0[3], carry);
+                    let (r4, carry2) = adc(r4, 0, carry);
+
+                    let k = r1.wrapping_mul($inv);
+                    let (_, carry) = mac(r1, k, $modulus.0[0], 0);
+                    let (r2, carry) = mac(r2, k, $modulus.0[1], carry);
+                    let (r3, carry) = mac(r3, k, $modulus.0[2], carry);
+                    let (r4, carry) = mac(r4, k, $modulus.0[3], carry);
+                    let (r5, carry2) = adc(r5, carry2, carry);
+
+                    let k = r2.wrapping_mul($inv);
+                    let (_, carry) = mac(r2, k, $modulus.0[0], 0);
+                    let (r3, carry) = mac(r3, k, $modulus.0[1], carry);
+                    let (r4, carry) = mac(r4, k, $modulus.0[2], carry);
+                    let (r5, carry) = mac(r5, k, $modulus.0[3], carry);
+                    let (r6, carry2) = adc(r6, carry2, carry);
+
+                    let k = r3.wrapping_mul($inv);
+                    let (_, carry) = mac(r3, k, $modulus.0[0], 0);
+                    let (r4, carry) = mac(r4, k, $modulus.0[1], carry);
+                    let (r5, carry) = mac(r5, k, $modulus.0[2], carry);
+                    let (r6, carry) = mac(r6, k, $modulus.0[3], carry);
+                    let (r7, _) = adc(r7, carry2, carry);
+
+                    // Montgomery reduction (sub part)
+                    let (d0, borrow) = sbb(r4, $modulus.0[0], 0);
+                    let (d1, borrow) = sbb(r5, $modulus.0[1], borrow);
+                    let (d2, borrow) = sbb(r6, $modulus.0[2], borrow);
+                    let (d3, borrow) = sbb(r7, $modulus.0[3], borrow);
+
+                    let (d0, carry) = adc(d0, $modulus.0[0] & borrow, 0);
+                    let (d1, carry) = adc(d1, $modulus.0[1] & borrow, carry);
+                    let (d2, carry) = adc(d2, $modulus.0[2] & borrow, carry);
+                    let (d3, _) = adc(d3, $modulus.0[3] & borrow, carry);
+
+                    $field([d0, d1, d2, d3])
+                }
+                #[cfg(not(feature = "asm"))]
+                {
+                    (&$field(val)).mul(&$r2)
+                }
             }
 
             /// Attempts to convert a little-endian byte representation of
@@ -63,6 +130,16 @@ macro_rules! field_common {
             pub fn to_bytes(&self) -> [u8; 32] {
                 <Self as ff::PrimeField>::to_repr(self)
             }
+
+            /// Lexicographic comparison of Montgomery forms.
+            #[inline(always)]
+            const fn is_less_than(x: &[u64; 4], y: &[u64; 4]) -> bool {
+                let (_, borrow) = sbb(x[0], y[0], 0);
+                let (_, borrow) = sbb(x[1], y[1], borrow);
+                let (_, borrow) = sbb(x[2], y[2], borrow);
+                let (_, borrow) = sbb(x[3], y[3], borrow);
+                borrow >> 63 == 1
+            }
         }
 
         impl Group for $field {
@@ -241,9 +318,9 @@ macro_rules! field_common {
             }
 
             fn get_lower_128(&self) -> u128 {
-                let tmp = $field::montgomery_reduce(
+                let tmp = $field::montgomery_reduce(&[
                     self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0,
-                );
+                ]);
 
                 u128::from(tmp.0[0]) | (u128::from(tmp.0[1]) << 64)
             }
@@ -345,7 +422,7 @@ macro_rules! field_arithmetic {
                 let (r6, carry) = mac(r6, self.0[3], self.0[3], carry);
                 let (r7, _) = adc(0, r7, carry);
 
-                $field::montgomery_reduce(r0, r1, r2, r3, r4, r5, r6, r7)
+                $field::montgomery_reduce(&[r0, r1, r2, r3, r4, r5, r6, r7])
             }
 
             /// Multiplies `rhs` by `self`, returning the result.
@@ -373,7 +450,7 @@ macro_rules! field_arithmetic {
                 let (r5, carry) = mac(r5, self.0[3], rhs.0[2], carry);
                 let (r6, r7) = mac(r6, self.0[3], rhs.0[3], carry);
 
-                $field::montgomery_reduce(r0, r1, r2, r3, r4, r5, r6, r7)
+                $field::montgomery_reduce(&[r0, r1, r2, r3, r4, r5, r6, r7])
             }
 
             /// Subtracts `rhs` from `self`, returning the result.
@@ -412,16 +489,6 @@ macro_rules! field_arithmetic {
 
                 $field([d0 & mask, d1 & mask, d2 & mask, d3 & mask])
             }
-
-            /// Lexicographic comparison of Montgomery forms.
-            #[inline(always)]
-            fn is_less_than(x: &[u64; 4], y: &[u64; 4]) -> bool {
-                let (_, borrow) = sbb(x[0], y[0], 0);
-                let (_, borrow) = sbb(x[1], y[1], borrow);
-                let (_, borrow) = sbb(x[2], y[2], borrow);
-                let (_, borrow) = sbb(x[3], y[3], borrow);
-                borrow >> 63 == 1
-            }
         }
     };
 }
@@ -443,49 +510,39 @@ macro_rules! field_specific {
                 (&$field([d0, d1, d2, d3])).sub(&$modulus)
             }
 
-            #[allow(clippy::too_many_arguments)]
             #[inline(always)]
-            pub(crate) const fn montgomery_reduce(
-                r0: u64,
-                r1: u64,
-                r2: u64,
-                r3: u64,
-                r4: u64,
-                r5: u64,
-                r6: u64,
-                r7: u64,
-            ) -> $field {
+            pub(crate) const fn montgomery_reduce(r: &[u64; 8]) -> $field {
                 // The Montgomery reduction here is based on Algorithm 14.32 in
                 // Handbook of Applied Cryptography
                 // <http://cacr.uwaterloo.ca/hac/about/chap14.pdf>.
 
-                let k = r0.wrapping_mul($inv);
-                let (_, carry) = mac(r0, k, $modulus.0[0], 0);
-                let (r1, carry) = mac(r1, k, $modulus.0[1], carry);
-                let (r2, carry) = mac(r2, k, $modulus.0[2], carry);
-                let (r3, carry) = mac(r3, k, $modulus.0[3], carry);
-                let (r4, carry2) = adc(r4, 0, carry);
+                let k = r[0].wrapping_mul($inv);
+                let (_, carry) = mac(r[0], k, $modulus.0[0], 0);
+                let (r1, carry) = mac(r[1], k, $modulus.0[1], carry);
+                let (r2, carry) = mac(r[2], k, $modulus.0[2], carry);
+                let (r3, carry) = mac(r[3], k, $modulus.0[3], carry);
+                let (r4, carry2) = adc(r[4], 0, carry);
 
                 let k = r1.wrapping_mul($inv);
                 let (_, carry) = mac(r1, k, $modulus.0[0], 0);
                 let (r2, carry) = mac(r2, k, $modulus.0[1], carry);
                 let (r3, carry) = mac(r3, k, $modulus.0[2], carry);
                 let (r4, carry) = mac(r4, k, $modulus.0[3], carry);
-                let (r5, carry2) = adc(r5, carry2, carry);
+                let (r5, carry2) = adc(r[5], carry2, carry);
 
                 let k = r2.wrapping_mul($inv);
                 let (_, carry) = mac(r2, k, $modulus.0[0], 0);
                 let (r3, carry) = mac(r3, k, $modulus.0[1], carry);
                 let (r4, carry) = mac(r4, k, $modulus.0[2], carry);
                 let (r5, carry) = mac(r5, k, $modulus.0[3], carry);
-                let (r6, carry2) = adc(r6, carry2, carry);
+                let (r6, carry2) = adc(r[6], carry2, carry);
 
                 let k = r3.wrapping_mul($inv);
                 let (_, carry) = mac(r3, k, $modulus.0[0], 0);
                 let (r4, carry) = mac(r4, k, $modulus.0[1], carry);
                 let (r5, carry) = mac(r5, k, $modulus.0[2], carry);
                 let (r6, carry) = mac(r6, k, $modulus.0[3], carry);
-                let (r7, _) = adc(r7, carry2, carry);
+                let (r7, _) = adc(r[7], carry2, carry);
 
                 // Result may be within MODULUS of the correct value
                 (&$field([r4, r5, r6, r7])).sub(&$modulus)
@@ -518,49 +575,39 @@ macro_rules! field_specific {
                 $field([d0, d1, d2, d3])
             }
 
-            #[allow(clippy::too_many_arguments)]
             #[inline(always)]
-            pub(crate) const fn montgomery_reduce(
-                r0: u64,
-                r1: u64,
-                r2: u64,
-                r3: u64,
-                r4: u64,
-                r5: u64,
-                r6: u64,
-                r7: u64,
-            ) -> Self {
+            pub(crate) const fn montgomery_reduce(r: &[u64; 8]) -> Self {
                 // The Montgomery reduction here is based on Algorithm 14.32 in
                 // Handbook of Applied Cryptography
                 // <http://cacr.uwaterloo.ca/hac/about/chap14.pdf>.
 
-                let k = r0.wrapping_mul($inv);
-                let (_, carry) = mac(r0, k, $modulus.0[0], 0);
-                let (r1, carry) = mac(r1, k, $modulus.0[1], carry);
-                let (r2, carry) = mac(r2, k, $modulus.0[2], carry);
-                let (r3, carry) = mac(r3, k, $modulus.0[3], carry);
-                let (r4, carry2) = adc(r4, 0, carry);
+                let k = r[0].wrapping_mul($inv);
+                let (_, carry) = mac(r[0], k, $modulus.0[0], 0);
+                let (r1, carry) = mac(r[1], k, $modulus.0[1], carry);
+                let (r2, carry) = mac(r[2], k, $modulus.0[2], carry);
+                let (r3, carry) = mac(r[3], k, $modulus.0[3], carry);
+                let (r4, carry2) = adc(r[4], 0, carry);
 
                 let k = r1.wrapping_mul($inv);
                 let (_, carry) = mac(r1, k, $modulus.0[0], 0);
                 let (r2, carry) = mac(r2, k, $modulus.0[1], carry);
                 let (r3, carry) = mac(r3, k, $modulus.0[2], carry);
                 let (r4, carry) = mac(r4, k, $modulus.0[3], carry);
-                let (r5, carry2) = adc(r5, carry2, carry);
+                let (r5, carry2) = adc(r[5], carry2, carry);
 
                 let k = r2.wrapping_mul($inv);
                 let (_, carry) = mac(r2, k, $modulus.0[0], 0);
                 let (r3, carry) = mac(r3, k, $modulus.0[1], carry);
                 let (r4, carry) = mac(r4, k, $modulus.0[2], carry);
                 let (r5, carry) = mac(r5, k, $modulus.0[3], carry);
-                let (r6, carry2) = adc(r6, carry2, carry);
+                let (r6, carry2) = adc(r[6], carry2, carry);
 
                 let k = r3.wrapping_mul($inv);
                 let (_, carry) = mac(r3, k, $modulus.0[0], 0);
                 let (r4, carry) = mac(r4, k, $modulus.0[1], carry);
                 let (r5, carry) = mac(r5, k, $modulus.0[2], carry);
                 let (r6, carry) = mac(r6, k, $modulus.0[3], carry);
-                let (r7, carry2) = adc(r7, carry2, carry);
+                let (r7, carry2) = adc(r[7], carry2, carry);
 
                 // Result may be within MODULUS of the correct value
                 let (d0, borrow) = sbb(r4, $modulus.0[0], 0);
diff --git a/src/secp256k1/fp.rs b/src/secp256k1/fp.rs
index 7135d75b..ffbf3a78 100644
--- a/src/secp256k1/fp.rs
+++ b/src/secp256k1/fp.rs
@@ -212,7 +212,7 @@ impl ff::PrimeField for Fp {
     fn to_repr(&self) -> Self::Repr {
         // Turn into canonical form by computing
         // (a.R) / R = a
-        let tmp = Fp::montgomery_reduce(self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0);
+        let tmp = Fp::montgomery_reduce(&[self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0]);
 
         let mut res = [0; 32];
         res[0..8].copy_from_slice(&tmp.0[0].to_le_bytes());
@@ -240,7 +240,7 @@ impl SqrtRatio for Fp {
     const T_MINUS1_OVER2: [u64; 4] = [0, 0, 0, 0];
 
     fn get_lower_32(&self) -> u32 {
-        let tmp = Fp::montgomery_reduce(self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0);
+        let tmp = Fp::montgomery_reduce(&[self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0]);
         tmp.0[0] as u32
     }
 }
diff --git a/src/secp256k1/fq.rs b/src/secp256k1/fq.rs
index 18ae6a5c..095f3482 100644
--- a/src/secp256k1/fq.rs
+++ b/src/secp256k1/fq.rs
@@ -237,7 +237,7 @@ impl ff::PrimeField for Fq {
     fn to_repr(&self) -> Self::Repr {
         // Turn into canonical form by computing
         // (a.R) / R = a
-        let tmp = Fq::montgomery_reduce(self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0);
+        let tmp = Fq::montgomery_reduce(&[self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0]);
 
         let mut res = [0; 32];
         res[0..8].copy_from_slice(&tmp.0[0].to_le_bytes());
@@ -270,7 +270,7 @@ impl SqrtRatio for Fq {
     ];
 
     fn get_lower_32(&self) -> u32 {
-        let tmp = Fq::montgomery_reduce(self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0);
+        let tmp = Fq::montgomery_reduce(&[self.0[0], self.0[1], self.0[2], self.0[3], 0, 0, 0, 0]);
         tmp.0[0] as u32
     }
 }