diff --git a/README.md b/README.md
index 354787f..d4cd779 100644
--- a/README.md
+++ b/README.md
@@ -163,10 +163,12 @@ Decoding performance for 48kHz stereo audio (full frame, CRC enabled):
 
 | Chip | Clock | 16-bit | 24-bit |
 | ---- | ----- | ------ | ------ |
-| ESP32-S3 | 240 MHz | ~25x realtime | ~17x realtime |
-| ESP32-P4 | 360 MHz | ~23x realtime | ~16x realtime |
+| ESP32 (internal SRAM) | 240 MHz | ~12x realtime | n/a |
+| ESP32 (PSRAM) | 240 MHz | ~8x realtime | n/a |
+| ESP32-S3 | 240 MHz | ~30x realtime | ~19x realtime |
+| ESP32-P4 | 360 MHz | ~25x realtime | ~18x realtime |
 
-Performance varies with block size, prediction order, and sample depth (24-bit requires 64-bit arithmetic). See [examples/decode_benchmark/README.md](examples/decode_benchmark/README.md) for detailed benchmarks, streaming overhead analysis, and instructions for running your own.
+ESP32-S3 and ESP32-P4 numbers are measured with the working buffer in PSRAM (the default); PSRAM is fast enough on these chips that switching to internal SRAM only saves ~2-4% on the S3 and well under 1% on the P4. On the original ESP32, PSRAM is much slower than internal SRAM, so placing the working buffer in internal memory (`CONFIG_MICRO_FLAC_PREFER_INTERNAL=y`) is roughly 30-35% faster and is recommended for performance-sensitive use. Performance also varies with block size, prediction order, and sample depth (24-bit requires 64-bit arithmetic). See [examples/decode_benchmark/README.md](examples/decode_benchmark/README.md) for detailed benchmarks, streaming overhead analysis, and instructions for running your own.
 
 ### Memory Usage
 
diff --git a/examples/decode_benchmark/README.md b/examples/decode_benchmark/README.md
index 629c218..ae0fd68 100644
--- a/examples/decode_benchmark/README.md
+++ b/examples/decode_benchmark/README.md
@@ -91,12 +91,12 @@ The benchmark runs each chunk size first with CRC disabled, then with CRC enable
                                 CRC Disabled           CRC Enabled
   Test Case              Time (ms) Real-time   Time (ms) Real-time
   --------------------  ---------- ---------  ---------- ---------
-  Full frame               1117.8     26.8x     1201.5     25.0x
-  1000 byte chunks         1122.1     26.7x     1206.7     24.9x
-  500 byte chunks          1127.6     26.6x     1212.8     24.7x
-  100 byte chunks          1171.5     25.6x     1260.4     23.8x
-  4 byte chunks            2473.3     12.1x     2642.9     11.4x
-  1 byte chunks            6769.5      4.4x     7208.0      4.2x
+  Full frame                918.26     32.7x      991.73     30.3x
+  1000 byte chunks          922.81     32.5x      997.24     30.1x
+  500 byte chunks           928.47     32.3x     1003.55     29.9x
+  100 byte chunks           975.27     30.8x     1053.93     28.5x
+  4 byte chunks            2373.16     12.6x     2527.86     11.9x
+  1 byte chunks            6935.53      4.3x     7296.88      4.1x
 ```
 
 ### ESP32-S3 @ 240 MHz (24-bit/48 kHz stereo, 30 seconds, packed 24-bit output)
@@ -109,12 +109,12 @@ The benchmark runs each chunk size first with CRC disabled, then with CRC enable
                                 CRC Disabled           CRC Enabled
   Test Case              Time (ms) Real-time   Time (ms) Real-time
   --------------------  ---------- ---------  ---------- ---------
-  Full frame               1622.7     18.5x     1810.0     16.6x
-  1000 byte chunks         1633.2     18.4x     1819.6     16.5x
-  500 byte chunks          1645.2     18.2x     1832.6     16.4x
-  100 byte chunks          1740.0     17.2x     1935.9     15.5x
-  4 byte chunks            4604.2      6.5x     4977.4      6.0x
-  1 byte chunks           13553.6      2.2x    14439.6      2.1x
+  Full frame               1385.14     21.7x     1550.19     19.4x
+  1000 byte chunks         1396.60     21.5x     1560.53     19.2x
+  500 byte chunks          1409.14     21.3x     1574.06     19.1x
+  100 byte chunks          1510.16     19.9x     1682.95     17.8x
+  4 byte chunks            4580.11      6.6x     4919.77      6.1x
+  1 byte chunks           14542.14      2.1x    15336.69      2.0x
 ```
 
 ### ESP32-S3 @ 240 MHz (24-bit/48 kHz stereo, 30 seconds, 32-bit output)
@@ -127,15 +127,15 @@ The benchmark runs each chunk size first with CRC disabled, then with CRC enable
                                 CRC Disabled           CRC Enabled
   Test Case              Time (ms) Real-time   Time (ms) Real-time
   --------------------  ---------- ---------  ---------- ---------
-  Full frame               1589.8     18.9x     1778.4     16.9x
-  1000 byte chunks         1601.2     18.7x     1787.8     16.8x
-  500 byte chunks          1613.2     18.6x     1800.9     16.7x
-  100 byte chunks          1707.6     17.6x     1903.3     15.8x
-  4 byte chunks            4555.4      6.6x     4928.5      6.1x
-  1 byte chunks           13455.1      2.2x    14341.4      2.1x
+  Full frame               1364.75     22.0x     1531.08     19.6x
+  1000 byte chunks         1376.54     21.8x     1541.01     19.5x
+  500 byte chunks          1389.18     21.6x     1554.69     19.3x
+  100 byte chunks          1489.55     20.1x     1662.72     18.0x
+  4 byte chunks            4538.67      6.6x     4878.53      6.1x
+  1 byte chunks           14435.03      2.1x    15229.60      2.0x
 ```
 
-Streaming with chunks of 100 bytes or larger has negligible overhead compared to full-frame decoding. CRC checking adds roughly 5-8% overhead for 16-bit and ~10-12% for 24-bit audio.
+Streaming with chunks of 100 bytes or larger has negligible overhead compared to full-frame decoding. CRC checking adds roughly 8% overhead for 16-bit and 12% for 24-bit audio.
 
 ## Interpreting Results
 
@@ -149,13 +149,16 @@ RTF = decode_time / audio_duration
 
 ### Expected Performance
 
-| Device | Clock | Bit depth | Expected RTF | Real-time |
-|--------|-------|-----------|--------------|-----------|
-| ESP32 | 240 MHz | 16-bit | 0.12-0.14 | 7-8x |
-| ESP32-S3 | 240 MHz | 16-bit | 0.037-0.040 | 25-27x |
-| ESP32-S3 | 240 MHz | 24-bit | 0.054-0.061 | 16-19x |
-| ESP32-P4 | 360 MHz | 16-bit | 0.042-0.044 | 23-24x |
-| ESP32-P4 | 360 MHz | 24-bit | 0.055-0.061 | 16-18x |
+| Device | Clock | Bit depth | Working buffer | Expected RTF | Real-time |
+|--------|-------|-----------|----------------|--------------|-----------|
+| ESP32 | 240 MHz | 16-bit | PSRAM | 0.107-0.131 | 7-9x |
+| ESP32 | 240 MHz | 16-bit | Internal | 0.079-0.087 | 11-13x |
+| ESP32-S3 | 240 MHz | 16-bit | PSRAM | 0.031-0.035 | 28-33x |
+| ESP32-S3 | 240 MHz | 24-bit | PSRAM | 0.046-0.056 | 18-22x |
+| ESP32-P4 | 360 MHz | 16-bit | PSRAM | 0.037-0.041 | 25-27x |
+| ESP32-P4 | 360 MHz | 24-bit | PSRAM | 0.050-0.058 | 17-20x |
+
+On the original ESP32, PSRAM access is much slower than internal SRAM, so placing the working buffer in internal memory (`CONFIG_MICRO_FLAC_PREFER_INTERNAL=y`) is roughly 30-35% faster. On the ESP32-S3, the same switch saves only ~2% (16-bit) to ~4% (24-bit), and on the ESP32-P4 it is below 1%. The S3/P4 numbers above are measured with the default PSRAM placement, and switching to internal SRAM yields essentially the same range.
 
 Performance varies based on:
 
diff --git a/include/micro_flac/flac_decoder.h b/include/micro_flac/flac_decoder.h
index 9a3011c..47d26fe 100644
--- a/include/micro_flac/flac_decoder.h
+++ b/include/micro_flac/flac_decoder.h
@@ -560,10 +560,12 @@ class FLACDecoder {
     /// @brief Read partition parameter and escape bits, advancing stage accordingly
     FLACDecoderResult read_partition_param(uint32_t block_size, uint32_t warm_up_samples);
 
-    /// @brief Read Rice-coded signed integer
-    /// @tparam Resuming  false = fresh read (hot path), true = resume after out-of-data
-    template <bool Resuming>
-    inline int32_t read_rice_sint(uint8_t param);
+    /// @brief Decode one non-escape Rice partition (out-of-lined on purpose).
+    /// Kept non-inline so the tight loop gets a clean register file, free of
+    /// pressure from the surrounding subframe state machine.
+    template <typename OutputT>
+    FLACDecoderResult decode_rice_partition(OutputT* out_ptr, uint8_t rice_param,
+                                            uint32_t partition_count);
 
     /// @brief Drain remaining unconsumed bytes from user buffer into bit_buffer_
     void drain_remaining_to_bit_buffer();
@@ -572,9 +574,6 @@ class FLACDecoder {
     // Bit Stream Reading
     // ========================================
 
-    /// @brief Refill bit buffer from input stream
-    inline bool refill_bit_buffer();
-
     /// @brief Read unsigned integer of specified bit width
     inline uint32_t read_uint(uint8_t num_bits);
 
diff --git a/src/README.md b/src/README.md
index e0a8045..66e9b08 100644
--- a/src/README.md
+++ b/src/README.md
@@ -14,7 +14,8 @@ Based on [Nayuki's Simple FLAC Implementation](https://www.nayuki.io/res/simple-
 
 ### Core Decoder
 
-- `flac_decoder.cpp` - Main decoder: state machine, container detection, header/metadata parsing, subframe decoding, residual decoding, bitstream reading
+- `flac_decoder.cpp` - Main decoder: state machine, container detection, header/metadata parsing, subframe decoding, residual decoding
+- `bit_reader.h` - Header-only bit-stream primitives: `BitReaderLocal` state struct plus `refill_bit_buffer_local()`, `read_uint_local()`, `read_rice_sint_local<Resuming>()`. Header-only so `FLAC_ALWAYS_INLINE` is honored at every call site
 - `frame_header.h` / `frame_header.cpp` - Frame header parsing: `compute_frame_header_length()`, `parse_frame_header()` (sync validation, field extraction, CRC-8 check, STREAMINFO validation)
 - `decorrelation.h` / `decorrelation.cpp` - Stereo channel decorrelation: `apply_channel_decorrelation()` for LEFT_SIDE, RIGHT_SIDE, and MID_SIDE joint stereo modes
 
@@ -125,7 +126,9 @@ After all subframes are decoded, channel decorrelation is applied via `apply_cha
 
 ### Bitstream Reading
 
-The decoder uses a platform-sized bit buffer: 64-bit on host/64-bit platforms (refilled 8 bytes at a time) and 32-bit on ESP32/32-bit platforms (refilled 4 bytes at a time). This avoids unnecessary 64-bit arithmetic on embedded targets while reducing refill frequency on desktop. Read functions are inlined.
+The bit-stream primitives live in `bit_reader.h` as header-only `FLAC_ALWAYS_INLINE` functions operating on a `BitReaderLocal` stack struct. Hoisting bit-reader state into a local struct lets the compiler keep it in registers across hot loops, avoiding aliasing-induced spills through the decoder's member fields.
+
+The decoder uses a platform-sized bit buffer: 64-bit on host/64-bit platforms (refilled 8 bytes at a time) and 32-bit on ESP32/32-bit platforms (refilled 4 bytes at a time). This avoids unnecessary 64-bit arithmetic on embedded targets while reducing refill frequency on desktop.
 
 ### LPC Accumulator Type Selection
 
diff --git a/src/bit_reader.h b/src/bit_reader.h
new file mode 100644
index 0000000..4c60e30
--- /dev/null
+++ b/src/bit_reader.h
@@ -0,0 +1,306 @@
+// Copyright 2026 Kevin Ahrendt
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "compiler.h"
+#include "micro_flac/flac_decoder.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace micro_flac {
+
+// Generate a bitmask with num_bits set to 1 (e.g., num_bits=3 -> 0b111 = 7)
+// This replaces the UINT_MASK lookup table with bit manipulation for better performance
+static FLAC_ALWAYS_INLINE uint32_t uint_mask(uint32_t num_bits) {
+    return (num_bits >= 32) ? UINT32_MAX : ((1U << num_bits) - 1);
+}
+
+// Mask for bit buffer width (used to mask shift amounts)
+static constexpr uint32_t BIT_BUFFER_SHIFT_MASK = BIT_BUFFER_BITS - 1;
+
+// ============================================================================
+// Local bit-reader state for the Rice partition hot loop
+// ============================================================================
+//
+// Hoisting the decoder's bit-reader fields (bit_buffer_, bit_buffer_length_,
+// buffer_, buffer_index_, bytes_left_) into a stack-local struct lets GCC
+// promote them into registers across the sample loop. Without this, writes
+// through out_ptr can alias the member fields in GCC's view, causing it to
+// spill/reload bit_buffer_ (and others) on every iteration.
+//
+// These helpers are the single implementation of the bit-stream primitives.
+// FLACDecoder::read_uint is a thin wrapper that copies state between `this->`
+// and a stack-local struct around a call to read_uint_local.
+// Rice-coded residual reads (decode_rice_partition) use read_rice_sint_local
+// directly against a shared BitReaderLocal to avoid paying for an extra
+// store/load across the resume/hot-loop boundary.
+struct BitReaderLocal {
+    bit_buffer_t bit_buffer;
+    const uint8_t* buffer;
+    size_t buffer_index;
+    size_t bytes_left;
+    uint32_t bit_buffer_length;
+};
+
+// Refill the bit buffer from the input stream. Returns true on out-of-data
+// (zero bytes remaining); otherwise loads as many bytes as fit into
+// bit_buffer, advances buffer_index/bytes_left, and sets bit_buffer_length.
+//
+// ESP-IDF disables jump tables by default (-fno-jump-tables), so a switch
+// statement compiles to a chain of comparisons anyway. Using explicit
+// if/else with FLAC_LIKELY on the hot path lets the compiler prioritize it.
+//
+// All paths overwrite bit_buffer with only the newly loaded bytes. Old bits
+// are NOT preserved. This is safe because every call site deals with old
+// bits before calling refill:
+//   - read_uint_local() and the binary slow path of read_rice_sint_local()
+//     copy the old bits into a local (`result` / `high`) first
+//   - the unary phase of read_rice_sint_local() only calls refill when
+//     bit_buffer_length == 0 (no old bits)
+static FLAC_ALWAYS_INLINE bool refill_bit_buffer_local(BitReaderLocal& s) {
+#if (BIT_BUFFER_BITS == 64)
+    if (FLAC_LIKELY(s.bytes_left >= 8)) {
+        s.bit_buffer = (static_cast<uint64_t>(s.buffer[s.buffer_index]) << 56) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 1]) << 48) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 2]) << 40) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 3]) << 32) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 4]) << 24) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 5]) << 16) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 6]) << 8) |
+                       s.buffer[s.buffer_index + 7];
+        s.buffer_index += 8;
+        s.bytes_left -= 8;
+        s.bit_buffer_length = 64;
+        return false;
+    }
+    if (s.bytes_left == 7) {
+        s.bit_buffer = (static_cast<uint64_t>(s.buffer[s.buffer_index]) << 48) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 1]) << 40) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 2]) << 32) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 3]) << 24) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 4]) << 16) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 5]) << 8) |
+                       s.buffer[s.buffer_index + 6];
+        s.buffer_index += 7;
+        s.bit_buffer_length = 56;
+        s.bytes_left = 0;
+        return false;
+    }
+    if (s.bytes_left == 6) {
+        s.bit_buffer = (static_cast<uint64_t>(s.buffer[s.buffer_index]) << 40) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 1]) << 32) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 2]) << 24) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 3]) << 16) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 4]) << 8) |
+                       s.buffer[s.buffer_index + 5];
+        s.buffer_index += 6;
+        s.bit_buffer_length = 48;
+        s.bytes_left = 0;
+        return false;
+    }
+    if (s.bytes_left == 5) {
+        s.bit_buffer = (static_cast<uint64_t>(s.buffer[s.buffer_index]) << 32) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 1]) << 24) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 2]) << 16) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 3]) << 8) |
+                       s.buffer[s.buffer_index + 4];
+        s.buffer_index += 5;
+        s.bit_buffer_length = 40;
+        s.bytes_left = 0;
+        return false;
+    }
+    if (s.bytes_left == 4) {
+        s.bit_buffer = (static_cast<uint64_t>(s.buffer[s.buffer_index]) << 24) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 1]) << 16) |
+                       (static_cast<uint64_t>(s.buffer[s.buffer_index + 2]) << 8) |
+                       s.buffer[s.buffer_index + 3];
+        s.buffer_index += 4;
+        s.bit_buffer_length = 32;
+        s.bytes_left = 0;
+        return false;
+    }
+#else
+    if (FLAC_LIKELY(s.bytes_left >= 4)) {
+        s.bit_buffer = (static_cast<uint32_t>(s.buffer[s.buffer_index]) << 24) |
+                       (static_cast<uint32_t>(s.buffer[s.buffer_index + 1]) << 16) |
+                       (static_cast<uint32_t>(s.buffer[s.buffer_index + 2]) << 8) |
+                       s.buffer[s.buffer_index + 3];
+        s.buffer_index += 4;
+        s.bytes_left -= 4;
+        s.bit_buffer_length = 32;
+        return false;
+    }
+#endif
+    if (s.bytes_left == 3) {
+        s.bit_buffer = (static_cast<bit_buffer_t>(s.buffer[s.buffer_index]) << 16) |
+                       (static_cast<bit_buffer_t>(s.buffer[s.buffer_index + 1]) << 8) |
+                       s.buffer[s.buffer_index + 2];
+        s.buffer_index += 3;
+        s.bit_buffer_length = 24;
+        s.bytes_left = 0;
+        return false;
+    }
+    if (s.bytes_left == 2) {
+        s.bit_buffer = (static_cast<bit_buffer_t>(s.buffer[s.buffer_index]) << 8) |
+                       s.buffer[s.buffer_index + 1];
+        s.buffer_index += 2;
+        s.bit_buffer_length = 16;
+        s.bytes_left = 0;
+        return false;
+    }
+    if (s.bytes_left == 1) {
+        s.bit_buffer = s.buffer[s.buffer_index];
+        s.buffer_index += 1;
+        s.bit_buffer_length = 8;
+        s.bytes_left = 0;
+        return false;
+    }
+    return true;
+}
+
+// Read an unsigned integer of `num_bits` bits using only the local state
+// struct. On out-of-data, sets `out_of_data=true` and returns 0 without
+// consuming any bits (bit_buffer/bit_buffer_length are left untouched in
+// that case, matching the member read_uint contract).
+static FLAC_ALWAYS_INLINE uint32_t read_uint_local(BitReaderLocal& s, uint8_t num_bits,
+                                                   bool& out_of_data) {
+    uint32_t result = 0;
+
+    if (num_bits > s.bit_buffer_length) {
+        const uint32_t new_bits_needed = num_bits - s.bit_buffer_length;
+        const size_t bytes_needed = (new_bits_needed + 7) / 8;
+
+        if (FLAC_UNLIKELY(s.bytes_left < bytes_needed)) {
+            out_of_data = true;
+            return 0;
+        }
+
+        if (new_bits_needed < BIT_BUFFER_BITS) {
+            // Some of the current bits will be used in the result
+            result = static_cast<uint32_t>(s.bit_buffer << new_bits_needed);
+        }
+
+        refill_bit_buffer_local(s);
+        s.bit_buffer_length = s.bit_buffer_length - new_bits_needed;
+    } else {
+        s.bit_buffer_length -= num_bits;
+    }
+
+    result |= static_cast<uint32_t>(s.bit_buffer >> (s.bit_buffer_length & BIT_BUFFER_SHIFT_MASK));
+    result &= uint_mask(num_bits);
+    return result;
+}
+
+// Read a Rice-coded signed integer using only the local state struct.
+// `mask` must equal `(1U << param) - 1` and is passed explicitly so the
+// caller can pin it to a register across the partition loop (the inlined
+// uint_mask otherwise tends to spill to the stack).
+// On out-of-data, sets *out_of_data=true and writes partial-progress info
+// into *unary_count_out and *binary_pending_out so the caller can persist
+// it into the decoder's RiceState for resume.
+//
+// Template param `Resuming`:
+//   false (hot path): start unary_count=0 and always run the unary phase.
+//   true             : seed unary_count from `unary_count_in`, and skip the
+//                      unary phase when `binary_pending_in` is true (the
+//                      unary phase already completed on a prior call).
+// With FLAC_ALWAYS_INLINE, the `Resuming` branches fold away at both call
+// sites, so the hot-path instruction sequence is identical to the previous
+// non-templated helper.
+template <bool Resuming = false>
+static FLAC_ALWAYS_INLINE int32_t read_rice_sint_local(
+    BitReaderLocal& s, uint8_t param, uint32_t mask, bool* out_of_data, uint32_t* unary_count_out,
+    bool* binary_pending_out, uint32_t unary_count_in = 0, bool binary_pending_in = false) {
+    uint32_t unary_count = Resuming ? unary_count_in : 0;
+    if (!Resuming || !binary_pending_in) {
+        while (true) {
+            if (s.bit_buffer_length == 0) {
+                if (FLAC_UNLIKELY(refill_bit_buffer_local(s))) {
+                    *unary_count_out = unary_count;
+                    *binary_pending_out = false;
+                    *out_of_data = true;
+                    return 0;
+                }
+            }
+            bit_buffer_t shifted = s.bit_buffer << (BIT_BUFFER_BITS - s.bit_buffer_length);
+            if (FLAC_UNLIKELY(shifted == 0)) {
+                unary_count += s.bit_buffer_length;
+                s.bit_buffer_length = 0;
+                continue;
+            }
+            uint32_t leading_zeros = static_cast<uint32_t>(FLAC_CLZ(shifted));
+            unary_count += leading_zeros;
+            s.bit_buffer_length = s.bit_buffer_length - (leading_zeros + 1);
+            break;
+        }
+    }
+
+    // Rice parameter is structurally bounded to < 32 by the FLAC spec
+    // (RFC 9639 §9.2.7): the partition parameter is read as a 4-bit or 5-bit
+    // field (residual coding method 0 or 1), giving max values of 15 and 31
+    // respectively. For method 1, value 31 is the escape marker; this
+    // function is only invoked from the non-escape branch, so param is
+    // guaranteed <= 30. Even with corrupted input, read_uint(N) physically
+    // cannot return more than 2^N - 1, so the invariant holds regardless.
+    //
+    // Hinting this to the compiler lets it drop the `num_bits >= 32` guard
+    // inside uint_mask / read_uint_local, which in turn lets register
+    // allocation keep `buffer_` pinned in a register (previously occupied by
+    // the constant 31) and eliminates a per-sample conditional mask reload.
+    //
+    // Safety note: this assumption is also relied on by `(unary_count << param)`
+    // below, which is UB for param >= 32. We are making an already-required
+    // invariant explicit to the optimizer, not introducing a new one.
+    FLAC_ASSUME(param < 32);
+
+    // Binary phase. Inline the fast path so the precomputed `mask` is used
+    // directly instead of going through uint_mask (which, even with the
+    // unreachable hint above, tends to spill the computed mask to the stack).
+    uint32_t binary = 0;
+    if (FLAC_LIKELY(param <= s.bit_buffer_length)) {
+        s.bit_buffer_length = s.bit_buffer_length - param;
+        binary =
+            static_cast<uint32_t>(s.bit_buffer >> (s.bit_buffer_length & BIT_BUFFER_SHIFT_MASK)) &
+            mask;
+    } else {
+        // Slow path: binary field straddles a refill. Inline read_uint here so
+        // we can reuse the caller-provided `mask` instead of rebuilding it via
+        // uint_mask (which tends to spill). Since param < 32 and
+        // bit_buffer_length >= 0, new_bits_needed is in [1, 31] (always
+        // strictly less than BIT_BUFFER_BITS on both 32- and 64-bit hosts), so
+        // we can skip the read_uint_local `new_bits_needed >= BIT_BUFFER_BITS`
+        // edge case.
+        const uint32_t new_bits_needed = param - s.bit_buffer_length;
+        const size_t bytes_needed = (new_bits_needed + 7) / 8;
+        if (FLAC_UNLIKELY(s.bytes_left < bytes_needed)) {
+            *unary_count_out = unary_count;
+            *binary_pending_out = true;
+            *out_of_data = true;
+            return 0;
+        }
+        const uint32_t high = static_cast<uint32_t>(s.bit_buffer << new_bits_needed);
+        refill_bit_buffer_local(s);
+        s.bit_buffer_length = s.bit_buffer_length - new_bits_needed;
+        binary = (high | static_cast<uint32_t>(s.bit_buffer >>
+                                               (s.bit_buffer_length & BIT_BUFFER_SHIFT_MASK))) &
+                 mask;
+    }
+
+    uint32_t value = (unary_count << param) | binary;
+    return static_cast<int32_t>((value >> 1) ^ -(value & 1));
+}
+
+}  // namespace micro_flac
diff --git a/src/compiler.h b/src/compiler.h
index 7ca9d50..2d16ab1 100644
--- a/src/compiler.h
+++ b/src/compiler.h
@@ -45,6 +45,17 @@
 #define FLAC_HOT
 #endif
 
+// Prevent inlining. Useful for extracting a tight loop into its own function
+// so the compiler can allocate registers for it without pressure from a
+// surrounding large switch/state machine.
+#if defined(__GNUC__) || defined(__clang__)
+#define FLAC_NOINLINE __attribute__((noinline))
+#elif defined(_MSC_VER)
+#define FLAC_NOINLINE __declspec(noinline)
+#else
+#define FLAC_NOINLINE
+#endif
+
 // Branch prediction hints
 #if defined(__GNUC__) || defined(__clang__)
 #define FLAC_LIKELY(x) __builtin_expect(!!(x), 1)
@@ -54,6 +65,33 @@
 #define FLAC_UNLIKELY(x) (x)
 #endif
 
+// Hint to the optimizer that an expression is true. Used to prune impossible
+// branches and tighten codegen on the hot path. Clang has `__builtin_assume`;
+// GCC lacks a direct equivalent, so fall back to the
+// `if (!x) __builtin_unreachable();` idiom (which evaluates `x`, unlike
+// __builtin_assume — keep `x` side-effect free).
+#if defined(__clang__)
+#define FLAC_ASSUME(x) __builtin_assume(x)
+#elif defined(__GNUC__)
+#define FLAC_ASSUME(x)               \
+    do {                             \
+        if (!(x))                    \
+            __builtin_unreachable(); \
+    } while (0)
+#else
+#define FLAC_ASSUME(x) ((void)0)
+#endif
+
+// Silence UBSan signed-integer-overflow in LPC restore functions.
+// FLAC LPC prediction intentionally uses wrapping int32_t arithmetic;
+// overflows in this path are audio-only and not a security concern.
+// Same approach as libFLAC (FLAC__lpc_restore_signal).
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && (defined(__GNUC__) || defined(__clang__))
+#define FLAC_NO_SANITIZE_OVERFLOW __attribute__((no_sanitize("signed-integer-overflow")))
+#else
+#define FLAC_NO_SANITIZE_OVERFLOW
+#endif
+
 #include <cstdint>
 
 namespace micro_flac {
diff --git a/src/crc.cpp b/src/crc.cpp
index 94479ab..a9193c2 100644
--- a/src/crc.cpp
+++ b/src/crc.cpp
@@ -250,28 +250,48 @@ uint8_t calculate_crc8(const uint8_t* data, size_t len) {
 }
 
 FLAC_OPTIMIZE_O3
-uint16_t update_crc16(uint16_t crc, const uint8_t* data, size_t len) {
+uint16_t update_crc16(uint16_t crc_in, const uint8_t* data, size_t len) {
     const uint8_t* end = data + len;
+    // Hold CRC in a 32-bit local so GCC doesn't insert a per-iteration
+    // `extui crc, crc, 0, 16` to honor uint16_t semantics. We only read bits
+    // [15:8] via a bit-field extract and XOR-in a 16-bit table value, so the
+    // upper bits are harmless until we narrow on return.
+    uint32_t crc = crc_in;
+
+    const uint8_t* end8 = data + (len & ~static_cast<size_t>(7));  // Round down to multiple of 8
 
 #if UINTPTR_MAX != 0xFFFFFFFF
     // On 64-bit hosts, process 8 bytes at a time using slicing-by-8
-    const uint8_t* end8 = data + (len & ~7U);  // Round down to multiple of 8
-
     while (data < end8) {
-        crc = static_cast<uint16_t>(
-            CRC16_TABLE_7[(crc >> 8) ^ data[0]] ^ CRC16_TABLE_6[(crc & 0xFF) ^ data[1]] ^
-            CRC16_TABLE_5[data[2]] ^ CRC16_TABLE_4[data[3]] ^ CRC16_TABLE_3[data[4]] ^
-            CRC16_TABLE_2[data[5]] ^ CRC16_TABLE_1[data[6]] ^ CRC16_TABLE_0[data[7]]);
+        crc = CRC16_TABLE_7[((crc >> 8) & 0xFF) ^ data[0]] ^ CRC16_TABLE_6[(crc & 0xFF) ^ data[1]] ^
+              CRC16_TABLE_5[data[2]] ^ CRC16_TABLE_4[data[3]] ^ CRC16_TABLE_3[data[4]] ^
+              CRC16_TABLE_2[data[5]] ^ CRC16_TABLE_1[data[6]] ^ CRC16_TABLE_0[data[7]];
+        data += 8;
+    }
+#else
+    // On 32-bit hosts, process 8 bytes per iteration using only TABLE_0.
+    // Amortizes the pointer advance and loop overhead across 8 bytes
+    // (7 + 1/8 = 7.125 insts/byte vs 8 for byte-at-a-time). Uses only
+    // TABLE_0 so no extra cache/flash pressure from additional tables.
+    while (data < end8) {
+        crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[0]];
+        crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[1]];
+        crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[2]];
+        crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[3]];
+        crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[4]];
+        crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[5]];
+        crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[6]];
+        crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[7]];
         data += 8;
     }
 #endif  // UINTPTR_MAX != 0xFFFFFFFF
 
-    // Byte-at-a-time (handles remaining bytes on 64-bit, all bytes on 32-bit)
+    // Byte-at-a-time (handles trailing bytes 0..7)
     while (data < end) {
-        crc = static_cast<uint16_t>((crc << 8) ^ CRC16_TABLE_0[(crc >> 8) ^ *data++]);
+        crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ *data++];
     }
 
-    return crc;
+    return static_cast<uint16_t>(crc);
 }
 
 }  // namespace micro_flac
diff --git a/src/decorrelation.cpp b/src/decorrelation.cpp
index 4be6243..e246a1c 100644
--- a/src/decorrelation.cpp
+++ b/src/decorrelation.cpp
@@ -55,8 +55,9 @@ void apply_channel_decorrelation(int32_t* block_samples, const SideT* side_chann
         for (; i < block_size; i++) {
             block_samples[i] = wadd32(side_channel[i], block_samples[block_size + i]);
         }
-    } else if (channel_assignment == CHANNEL_MID_SIDE) {
-        // MID_SIDE: Left = Mid + Side/2, Right = Mid - Side/2
+    } else {
+        // MID_SIDE (caller guarantees channel_assignment is 8, 9, or 10).
+        // Left = Mid + Side/2, Right = Mid - Side/2
         // Arithmetic right shift for side>>1 is required by the FLAC spec.
         // With SideT=int64_t, the shift operates on the full 33-bit value.
         // Process 4 samples at a time
diff --git a/src/flac_decoder.cpp b/src/flac_decoder.cpp
index 9f76ca3..ca9398a 100644
--- a/src/flac_decoder.cpp
+++ b/src/flac_decoder.cpp
@@ -15,6 +15,7 @@
 #include "micro_flac/flac_decoder.h"
 
 #include "alloc.h"
+#include "bit_reader.h"
 #include "compiler.h"
 #include "crc.h"
 #include "decorrelation.h"
@@ -55,14 +56,8 @@ static constexpr const int32_t* FIXED_COEFFICIENTS[] = {nullptr, FIXED_COEFFICIE
                                                         FIXED_COEFFICIENTS_2, FIXED_COEFFICIENTS_3,
                                                         FIXED_COEFFICIENTS_4};
 
-// Generate a bitmask with num_bits set to 1 (e.g., num_bits=3 -> 0b111 = 7)
-// This replaces the UINT_MASK lookup table with bit manipulation for better performance
-static FLAC_ALWAYS_INLINE uint32_t uint_mask(uint32_t num_bits) {
-    return (num_bits >= 32) ? UINT32_MAX : ((1U << num_bits) - 1);
-}
-
-// Mask for bit buffer width (used to mask shift amounts)
-static constexpr uint32_t BIT_BUFFER_SHIFT_MASK = BIT_BUFFER_BITS - 1;
+// The bit-reader primitives (BitReaderLocal, refill_bit_buffer_local,
+// read_uint_local, read_rice_sint_local) live in bit_reader.h.
 
 // ============================================================================
 // FLACMetadataBlock Lifecycle
@@ -769,7 +764,7 @@ FLAC_HOT FLACDecoderResult FLACDecoder::decode_frame(const uint8_t* buffer, size
     return FLAC_DECODER_ERROR_INTERNAL;
 }
 
-FLAC_HOT void FLACDecoder::reset_frame_state() {
+void FLACDecoder::reset_frame_state() {
     this->frame_ = FrameState{};
     this->subframe_ = SubframeState{};
     this->residual_ = ResidualState{};
@@ -1323,39 +1318,12 @@ FLAC_OPTIMIZE_O3 FLACDecoderResult FLACDecoder::decode_residuals(OutputT* sub_fr
                     }
                     this->residual_.sample_idx = sample_idx_e;
                 } else {
-                    // Residuals are always decoded as int32_t even on the int64_t (wide-side)
-                    // path. Per RFC 9639 §9.2.7.3, Rice-coded residuals must fit in a signed
-                    // 32-bit two's complement integer (excluding INT32_MIN), so int32_t is
-                    // sufficient regardless of the output sample type.
-
-                    // Hoist struct fields into locals for the hot loop.
-                    // This lets the compiler keep them in registers instead
-                    // of reloading from memory on every iteration.
-                    uint32_t sample_idx = this->residual_.sample_idx;
-                    const uint32_t partition_count = this->residual_.partition_count;
                     const uint8_t rice_param = static_cast<uint8_t>(this->residual_.param);
-
-                    // If resuming mid-rice-read, finish that one sample first
-                    if (this->rice_.pending) {
-                        int32_t val = this->read_rice_sint<true>(rice_param);
-                        if (FLAC_UNLIKELY(this->out_of_data_)) {
-                            this->residual_.sample_idx = sample_idx;
-                            return FLAC_DECODER_NEED_MORE_DATA;
-                        }
-                        out_ptr[sample_idx] = val;
-                        sample_idx++;
-                        this->rice_.pending = false;
-                    }
-                    for (; sample_idx < partition_count; sample_idx++) {
-                        int32_t val = this->read_rice_sint<false>(rice_param);
-                        if (FLAC_UNLIKELY(this->out_of_data_)) {
-                            this->residual_.sample_idx = sample_idx;
-                            this->rice_.pending = true;
-                            return FLAC_DECODER_NEED_MORE_DATA;
-                        }
-                        out_ptr[sample_idx] = val;
+                    FLACDecoderResult ret = this->decode_rice_partition<OutputT>(
+                        out_ptr, rice_param, this->residual_.partition_count);
+                    if (ret != FLAC_DECODER_SUCCESS) {
+                        return ret;
                     }
-                    this->residual_.sample_idx = sample_idx;
                 }
 
                 this->residual_.out_ptr_offset += this->residual_.partition_count;
@@ -1379,6 +1347,100 @@ FLAC_OPTIMIZE_O3 FLACDecoderResult FLACDecoder::decode_residuals(OutputT* sub_fr
 template FLACDecoderResult FLACDecoder::decode_residuals<int32_t>(int32_t*, uint32_t, uint32_t);
 template FLACDecoderResult FLACDecoder::decode_residuals<int64_t>(int64_t*, uint32_t, uint32_t);
 
+// Non-inline so the hot Rice sample loop gets a fresh register file. The
+// surrounding subframe state machine holds many live values; isolating this
+// loop in its own function lets the compiler allocate registers just for
+// bit-reader state + partition locals.
+template <typename OutputT>
+FLAC_HOT FLAC_NOINLINE FLACDecoderResult
+FLACDecoder::decode_rice_partition(OutputT* out_ptr, uint8_t rice_param, uint32_t partition_count) {
+    // Residuals are always decoded as int32_t even on the int64_t (wide-side) path.
+    // Per RFC 9639 §9.2.7.3, Rice-coded residuals must fit in a signed 32-bit two's
+    // complement integer (excluding INT32_MIN), so int32_t is sufficient regardless
+    // of the output sample type.
+    uint32_t sample_idx = this->residual_.sample_idx;
+
+    // Hoist bit-reader state into a stack-local struct so the compiler can
+    // keep it in registers across the loop instead of reloading through this->
+    // on every iteration. Shared between the resume prefix and the hot loop
+    // so we don't pay an extra store/load around the resume.
+    BitReaderLocal st{};
+    st.bit_buffer = this->bit_buffer_;
+    st.bit_buffer_length = this->bit_buffer_length_;
+    st.buffer = this->buffer_;
+    st.buffer_index = this->buffer_index_;
+    st.bytes_left = this->bytes_left_;
+
+    bool oo_d = false;
+    uint32_t unary_pending = 0;
+    bool binary_pending = false;
+
+    // Precompute the Rice mask and try to pin it to a register across the loop.
+    // On Xtensa, GCC otherwise spills the mask to the stack and reloads it
+    // inside the inlined binary extraction on every sample (an extra l32i per
+    // residual on the hot path). Binding to a15, the last callee-saved
+    // general-purpose register in the current window, keeps it live without
+    // displacing the bit-reader state that's already hot in registers.
+#if defined(__xtensa__) || defined(__XTENSA__)
+    register uint32_t rice_mask asm("a15") = (static_cast<uint32_t>(1) << rice_param) - 1;
+#else
+    const uint32_t rice_mask = (static_cast<uint32_t>(1) << rice_param) - 1;
+#endif
+
+    // If resuming mid-rice-read, finish that one sample on the resume path
+    // before entering the non-resuming hot loop. Rice parameter is < 32
+    // (see note inside read_rice_sint_local); the assert here guards that
+    // invariant locally.
+    if (FLAC_UNLIKELY(this->rice_.pending)) {
+        assert(rice_param < 32);
+        int32_t val = read_rice_sint_local<true>(st, rice_param, rice_mask, &oo_d, &unary_pending,
+                                                 &binary_pending, this->rice_.unary_count,
+                                                 this->rice_.binary_pending);
+        if (FLAC_UNLIKELY(oo_d)) {
+            this->bit_buffer_ = st.bit_buffer;
+            this->bit_buffer_length_ = static_cast<uint8_t>(st.bit_buffer_length);
+            this->buffer_index_ = st.buffer_index;
+            this->bytes_left_ = st.bytes_left;
+            this->out_of_data_ = true;
+            this->rice_.unary_count = unary_pending;
+            this->rice_.binary_pending = binary_pending;
+            this->residual_.sample_idx = sample_idx;
+            return FLAC_DECODER_NEED_MORE_DATA;
+        }
+        out_ptr[sample_idx] = val;
+        sample_idx++;
+        this->rice_.pending = false;
+    }
+
+    for (; sample_idx < partition_count; sample_idx++) {
+        int32_t val =
+            read_rice_sint_local(st, rice_param, rice_mask, &oo_d, &unary_pending, &binary_pending);
+        if (FLAC_UNLIKELY(oo_d)) {
+            this->bit_buffer_ = st.bit_buffer;
+            this->bit_buffer_length_ = static_cast<uint8_t>(st.bit_buffer_length);
+            this->buffer_index_ = st.buffer_index;
+            this->bytes_left_ = st.bytes_left;
+            this->out_of_data_ = true;
+            this->rice_.unary_count = unary_pending;
+            this->rice_.binary_pending = binary_pending;
+            this->rice_.pending = true;
+            this->residual_.sample_idx = sample_idx;
+            return FLAC_DECODER_NEED_MORE_DATA;
+        }
+        out_ptr[sample_idx] = val;
+    }
+
+    this->bit_buffer_ = st.bit_buffer;
+    this->bit_buffer_length_ = static_cast<uint8_t>(st.bit_buffer_length);
+    this->buffer_index_ = st.buffer_index;
+    this->bytes_left_ = st.bytes_left;
+    this->residual_.sample_idx = sample_idx;
+    return FLAC_DECODER_SUCCESS;
+}
+
+template FLACDecoderResult FLACDecoder::decode_rice_partition<int32_t>(int32_t*, uint8_t, uint32_t);
+template FLACDecoderResult FLACDecoder::decode_rice_partition<int64_t>(int64_t*, uint8_t, uint32_t);
+
 FLAC_HOT FLACDecoderResult FLACDecoder::read_partition_param(uint32_t block_size,
                                                              uint32_t warm_up_samples) {
     if (this->subframe_.stage == SubframeDecodeStage::RESIDUAL_PARTITION_PARAM) {
@@ -1417,54 +1479,6 @@ FLAC_HOT FLACDecoderResult FLACDecoder::read_partition_param(uint32_t block_size
     return FLAC_DECODER_SUCCESS;
 }
 
-template <bool Resuming>
-FLAC_ALWAYS_INLINE int32_t FLACDecoder::read_rice_sint(uint8_t param) {
-    uint32_t unary_count = Resuming ? this->rice_.unary_count : 0;
-
-    if (!Resuming || !this->rice_.binary_pending) {
-        // Unary phase: count leading zeros
-        while (true) {
-            if (this->bit_buffer_length_ == 0) {
-                if (FLAC_UNLIKELY(this->refill_bit_buffer())) {
-                    this->rice_.unary_count = unary_count;
-                    this->rice_.binary_pending = false;
-                    this->out_of_data_ = true;
-                    return 0;
-                }
-            }
-
-            bit_buffer_t shifted_buffer = this->bit_buffer_
-                                          << (BIT_BUFFER_BITS - this->bit_buffer_length_);
-
-            if (FLAC_UNLIKELY(shifted_buffer == 0)) {
-                unary_count += this->bit_buffer_length_;
-                this->bit_buffer_length_ = 0;
-                continue;
-            }
-
-            uint32_t leading_zeros = static_cast<uint32_t>(FLAC_CLZ(shifted_buffer));
-            unary_count += leading_zeros;
-            this->bit_buffer_length_ =
-                static_cast<uint8_t>(this->bit_buffer_length_ - (leading_zeros + 1));
-            break;
-        }
-    }
-
-    // Binary phase: read rice parameter bits
-    uint32_t binary = this->read_uint(param);
-    if (FLAC_UNLIKELY(this->out_of_data_)) {
-        this->rice_.unary_count = unary_count;
-        this->rice_.binary_pending = true;
-        return 0;
-    }
-
-    // Rice parameter is at most 30 (5-bit param, with 31 reserved as escape code),
-    // so shifting a uint32_t left by param is always well-defined.
-    assert(param < 32);
-    uint32_t value = (unary_count << param) | binary;
-    return static_cast<int32_t>((value >> 1) ^ -(value & 1));
-}
-
 void FLACDecoder::drain_remaining_to_bit_buffer() {
     // Drain unconsumed bytes from user's buffer into bit_buffer_.
     // Safe because: when read_uint fails, bit_buffer_length_ + 8*bytes_left_ < BIT_BUFFER_BITS
@@ -1480,146 +1494,24 @@ void FLACDecoder::drain_remaining_to_bit_buffer() {
 // Bit Stream Reading
 // ============================================================================
 
-FLAC_ALWAYS_INLINE bool FLACDecoder::refill_bit_buffer() {
-    // ESP-IDF disables jump tables by default (-fno-jump-tables), so a switch statement
-    // compiles to a chain of comparisons anyway. Using explicit if/else with FLAC_LIKELY
-    // on the hot path lets the compiler prioritize it.
-    //
-    // All paths overwrite bit_buffer_ with only the newly loaded bytes. Old bits are NOT
-    // preserved. This is safe because both callers handle old bits before calling refill:
-    //   - read_uint() extracts old bits into its local `result` before calling refill
-    //   - read_rice_sint() only calls refill when bit_buffer_length_ == 0 (no old bits)
-#if (BIT_BUFFER_BITS == 64)
-    if (FLAC_LIKELY(this->bytes_left_ >= 8)) {
-        // 8 or more bytes available: load 8 bytes big-endian
-        this->bit_buffer_ = (static_cast<uint64_t>(this->buffer_[this->buffer_index_]) << 56) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 1]) << 48) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 2]) << 40) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 3]) << 32) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 4]) << 24) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 5]) << 16) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 6]) << 8) |
-                            this->buffer_[this->buffer_index_ + 7];
-        this->buffer_index_ += 8;
-        this->bytes_left_ -= 8;
-        this->bit_buffer_length_ = 64;
-        return false;
-    }
-    if (this->bytes_left_ == 7) {
-        this->bit_buffer_ = (static_cast<uint64_t>(this->buffer_[this->buffer_index_]) << 48) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 1]) << 40) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 2]) << 32) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 3]) << 24) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 4]) << 16) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 5]) << 8) |
-                            this->buffer_[this->buffer_index_ + 6];
-        this->buffer_index_ += 7;
-        this->bit_buffer_length_ = 56;
-        this->bytes_left_ = 0;
-        return false;
-    }
-    if (this->bytes_left_ == 6) {
-        this->bit_buffer_ = (static_cast<uint64_t>(this->buffer_[this->buffer_index_]) << 40) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 1]) << 32) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 2]) << 24) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 3]) << 16) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 4]) << 8) |
-                            this->buffer_[this->buffer_index_ + 5];
-        this->buffer_index_ += 6;
-        this->bit_buffer_length_ = 48;
-        this->bytes_left_ = 0;
-        return false;
-    }
-    if (this->bytes_left_ == 5) {
-        this->bit_buffer_ = (static_cast<uint64_t>(this->buffer_[this->buffer_index_]) << 32) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 1]) << 24) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 2]) << 16) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 3]) << 8) |
-                            this->buffer_[this->buffer_index_ + 4];
-        this->buffer_index_ += 5;
-        this->bit_buffer_length_ = 40;
-        this->bytes_left_ = 0;
-        return false;
-    }
-    if (this->bytes_left_ == 4) {
-        this->bit_buffer_ = (static_cast<uint64_t>(this->buffer_[this->buffer_index_]) << 24) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 1]) << 16) |
-                            (static_cast<uint64_t>(this->buffer_[this->buffer_index_ + 2]) << 8) |
-                            this->buffer_[this->buffer_index_ + 3];
-        this->buffer_index_ += 4;
-        this->bit_buffer_length_ = 32;
-        this->bytes_left_ = 0;
-        return false;
-    }
-#else
-    if (FLAC_LIKELY(this->bytes_left_ >= 4)) {
-        // 4 or more bytes available: load 4 bytes big-endian
-        this->bit_buffer_ = (static_cast<uint32_t>(this->buffer_[this->buffer_index_]) << 24) |
-                            (static_cast<uint32_t>(this->buffer_[this->buffer_index_ + 1]) << 16) |
-                            (static_cast<uint32_t>(this->buffer_[this->buffer_index_ + 2]) << 8) |
-                            this->buffer_[this->buffer_index_ + 3];
-        this->buffer_index_ += 4;
-        this->bytes_left_ -= 4;
-        this->bit_buffer_length_ = 32;
-        return false;
-    }
-#endif
-    if (this->bytes_left_ == 3) {
-        this->bit_buffer_ =
-            (static_cast<bit_buffer_t>(this->buffer_[this->buffer_index_]) << 16) |
-            (static_cast<bit_buffer_t>(this->buffer_[this->buffer_index_ + 1]) << 8) |
-            this->buffer_[this->buffer_index_ + 2];
-        this->buffer_index_ += 3;
-        this->bit_buffer_length_ = 24;
-        this->bytes_left_ = 0;
-        return false;
-    }
-    if (this->bytes_left_ == 2) {
-        this->bit_buffer_ = (static_cast<bit_buffer_t>(this->buffer_[this->buffer_index_]) << 8) |
-                            this->buffer_[this->buffer_index_ + 1];
-        this->buffer_index_ += 2;
-        this->bit_buffer_length_ = 16;
-        this->bytes_left_ = 0;
-        return false;
-    }
-    if (this->bytes_left_ == 1) {
-        this->bit_buffer_ = this->buffer_[this->buffer_index_];
-        this->buffer_index_ += 1;
-        this->bit_buffer_length_ = 8;
-        this->bytes_left_ = 0;
-        return false;
-    }
-    return true;
-}
-
 FLAC_ALWAYS_INLINE uint32_t FLACDecoder::read_uint(uint8_t num_bits) {
-    uint32_t result = 0;
-
-    if (num_bits > this->bit_buffer_length_) {
-        const uint32_t new_bits_needed = num_bits - this->bit_buffer_length_;
-        size_t bytes_needed = (new_bits_needed + 7) / 8;
-
-        if (FLAC_UNLIKELY(this->bytes_left_ < bytes_needed)) {
-            this->out_of_data_ = true;
-            return 0;
-        }
-
-        if (new_bits_needed < BIT_BUFFER_BITS) {
-            // Some of the current bits will be used in the result
-            result = static_cast<uint32_t>(this->bit_buffer_ << new_bits_needed);
-        }
-
-        this->refill_bit_buffer();
-        this->bit_buffer_length_ = static_cast<uint8_t>(this->bit_buffer_length_ - new_bits_needed);
-    } else {
-        this->bit_buffer_length_ -= num_bits;
+    BitReaderLocal s{};
+    s.bit_buffer = this->bit_buffer_;
+    s.bit_buffer_length = this->bit_buffer_length_;
+    s.buffer = this->buffer_;
+    s.buffer_index = this->buffer_index_;
+    s.bytes_left = this->bytes_left_;
+
+    bool oo_d = false;
+    const uint32_t result = read_uint_local(s, num_bits, oo_d);
+
+    this->bit_buffer_ = s.bit_buffer;
+    this->bit_buffer_length_ = static_cast<uint8_t>(s.bit_buffer_length);
+    this->buffer_index_ = s.buffer_index;
+    this->bytes_left_ = s.bytes_left;
+    if (FLAC_UNLIKELY(oo_d)) {
+        this->out_of_data_ = true;
     }
-
-    result |= static_cast<uint32_t>(this->bit_buffer_ >>
-                                    (this->bit_buffer_length_ & BIT_BUFFER_SHIFT_MASK));
-
-    result &= uint_mask(num_bits);
-
     return result;
 }
 
diff --git a/src/frame_header.cpp b/src/frame_header.cpp
index d171279..ace6a90 100644
--- a/src/frame_header.cpp
+++ b/src/frame_header.cpp
@@ -71,12 +71,8 @@ uint8_t compute_frame_header_length(const uint8_t* header) {
 FLACDecoderResult parse_frame_header(const uint8_t* header, uint8_t header_len,
                                      const FLACStreamInfo& stream_info, bool crc_check,
                                      FrameHeaderInfo& info) {
-    // Bytes 0-1: sync code (already validated by decode_frame_header_phase)
-
-    // Reserved bit check
-    if (header[1] & 0x02) {
-        return FLAC_DECODER_ERROR_BAD_HEADER;
-    }
+    // Bytes 0-1: sync code + reserved bit (already validated by
+    // decode_frame_header_phase via the 0xFE mask, which forces header[1] bit 1 = 0)
 
     // Byte 2: block_size_code (upper nibble) + sample_rate_code (lower nibble)
     if (header[2] == 0xFF) {
diff --git a/src/lpc.cpp b/src/lpc.cpp
index b09b1c5..072ba5d 100644
--- a/src/lpc.cpp
+++ b/src/lpc.cpp
@@ -14,6 +14,7 @@
 
 #include "lpc.h"
 
+#include "compiler.h"
 #include "xtensa/lpc_xtensa.h"
 
 #include <cstdint>
@@ -79,8 +80,9 @@ static bool can_use_lpc_32bit(uint32_t bits_per_sample, const int32_t* coefs, ui
 }
 
 template <uint32_t ORDER>
-static void restore_lpc_32bit_order(int32_t* sub_frame_buffer, size_t num_of_samples,
-                                    const int32_t* coefs, int32_t shift) {
+FLAC_NO_SANITIZE_OVERFLOW static void restore_lpc_32bit_order(int32_t* sub_frame_buffer,
+                                                              size_t num_of_samples,
+                                                              const int32_t* coefs, int32_t shift) {
     const size_t outer_loop_bound = num_of_samples - ORDER;
 
     for (size_t i = 0; i < outer_loop_bound; ++i) {
@@ -92,8 +94,9 @@ static void restore_lpc_32bit_order(int32_t* sub_frame_buffer, size_t num_of_sam
     }
 }
 
-static void restore_lpc_32bit(int32_t* sub_frame_buffer, size_t num_of_samples,
-                              const int32_t* coefs, uint32_t order, int32_t shift) {
+FLAC_NO_SANITIZE_OVERFLOW static void restore_lpc_32bit(int32_t* sub_frame_buffer,
+                                                        size_t num_of_samples, const int32_t* coefs,
+                                                        uint32_t order, int32_t shift) {
 #if (FLAC_LPC_XTENSA_ENABLED == 1)
     // Use optimized assembly version for Xtensa
     restore_lpc_32bit_asm(sub_frame_buffer, num_of_samples, coefs, order, shift);
@@ -115,7 +118,7 @@ static void restore_lpc_32bit(int32_t* sub_frame_buffer, size_t num_of_samples,
         case 12: restore_lpc_32bit_order<12>(sub_frame_buffer, num_of_samples, coefs, shift); break;
             // NOLINTEND(readability-magic-numbers)
         // clang-format on
-        default: {
+        default:
             const size_t outer_loop_bound = num_of_samples - order;
             for (size_t i = 0; i < outer_loop_bound; ++i) {
                 int32_t sum = 0;
@@ -125,14 +128,14 @@ static void restore_lpc_32bit(int32_t* sub_frame_buffer, size_t num_of_samples,
                 sub_frame_buffer[i + order] += (sum >> shift);
             }
             break;
-        }
     }
 #endif
 }
 
 template <uint32_t ORDER>
-static void restore_lpc_64bit_order(int32_t* sub_frame_buffer, size_t num_of_samples,
-                                    const int32_t* coefs, int32_t shift) {
+FLAC_NO_SANITIZE_OVERFLOW static void restore_lpc_64bit_order(int32_t* sub_frame_buffer,
+                                                              size_t num_of_samples,
+                                                              const int32_t* coefs, int32_t shift) {
     const size_t outer_loop_bound = num_of_samples - ORDER;
 
     for (size_t i = 0; i < outer_loop_bound; ++i) {
@@ -144,8 +147,9 @@ static void restore_lpc_64bit_order(int32_t* sub_frame_buffer, size_t num_of_sam
     }
 }
 
-static void restore_lpc_64bit(int32_t* sub_frame_buffer, size_t num_of_samples,
-                              const int32_t* coefs, uint32_t order, int32_t shift) {
+FLAC_NO_SANITIZE_OVERFLOW static void restore_lpc_64bit(int32_t* sub_frame_buffer,
+                                                        size_t num_of_samples, const int32_t* coefs,
+                                                        uint32_t order, int32_t shift) {
 #if (FLAC_LPC_XTENSA_ENABLED == 1)
     // Use optimized 64-bit assembly version for Xtensa
     restore_lpc_64bit_asm(sub_frame_buffer, num_of_samples, coefs, order, shift);
@@ -167,7 +171,7 @@ static void restore_lpc_64bit(int32_t* sub_frame_buffer, size_t num_of_samples,
         case 12: restore_lpc_64bit_order<12>(sub_frame_buffer, num_of_samples, coefs, shift); break;
             // NOLINTEND(readability-magic-numbers)
         // clang-format on
-        default: {
+        default:
             const size_t outer_loop_bound = num_of_samples - order;
             for (size_t i = 0; i < outer_loop_bound; ++i) {
                 int64_t sum = 0;
@@ -178,13 +182,13 @@ static void restore_lpc_64bit(int32_t* sub_frame_buffer, size_t num_of_samples,
                 sub_frame_buffer[i + order] += static_cast<int32_t>(sum >> shift);
             }
             break;
-        }
     }
 #endif
 }
 
-void restore_lpc(int32_t* sub_frame_buffer, size_t num_of_samples, uint32_t bits_per_sample,
-                 const int32_t* coefs, uint32_t order, int32_t shift) {
+FLAC_NO_SANITIZE_OVERFLOW void restore_lpc(int32_t* sub_frame_buffer, size_t num_of_samples,
+                                           uint32_t bits_per_sample, const int32_t* coefs,
+                                           uint32_t order, int32_t shift) {
 #ifdef MICRO_FLAC_DUMP_LPC_VECTORS
     const uint32_t max_dump = 256;
     uint32_t save_count = (static_cast<uint32_t>(num_of_samples) < max_dump)
@@ -208,8 +212,9 @@ void restore_lpc(int32_t* sub_frame_buffer, size_t num_of_samples, uint32_t bits
 #endif
 }
 
-void restore_lpc(int64_t* sub_frame_buffer, size_t num_of_samples, uint32_t /*bits_per_sample*/,
-                 const int32_t* coefs, uint32_t order, int32_t shift) {
+FLAC_NO_SANITIZE_OVERFLOW void restore_lpc(int64_t* sub_frame_buffer, size_t num_of_samples,
+                                           uint32_t /*bits_per_sample*/, const int32_t* coefs,
+                                           uint32_t order, int32_t shift) {
     const size_t outer_loop_bound = num_of_samples - order;
 
     for (size_t i = 0; i < outer_loop_bound; ++i) {
diff --git a/src/pcm_packing.cpp b/src/pcm_packing.cpp
index 61ced5b..f3107ca 100644
--- a/src/pcm_packing.cpp
+++ b/src/pcm_packing.cpp
@@ -94,6 +94,50 @@ static void write_samples_16bit_2ch(uint8_t* output_buffer, const int32_t* block
     }
 }
 
+FLAC_OPTIMIZE_O3
+static void write_samples_24bit_2ch_aligned(uint8_t* output_buffer, const int32_t* block_samples,
+                                            uint32_t block_size) {
+    // 24-bit stereo fast path for 4-byte-aligned buffers. Packs 2 stereo
+    // pairs (12 bytes) into 3 uint32_t stores per iteration instead of 12
+    // byte stores. Caller has verified output_buffer alignment.
+    //
+    // Byte layout of 2 stereo pairs in memory (word stores assume a little-endian host):
+    //   L0[0] L0[1] L0[2] R0[0]   R0[1] R0[2] L1[0] L1[1]   L1[2] R1[0] R1[1] R1[2]
+    //   \------ word0 ------/     \------ word1 ------/     \------ word2 ------/
+    uint32_t* out32 = reinterpret_cast<uint32_t*>(output_buffer);
+    const int32_t* left = block_samples;
+    const int32_t* right = block_samples + block_size;
+
+    uint32_t i = 0;
+    const uint32_t unroll_limit = block_size & ~1U;
+
+    for (; i < unroll_limit; i += 2) {
+        const uint32_t l0 = static_cast<uint32_t>(left[i]);
+        const uint32_t r0 = static_cast<uint32_t>(right[i]);
+        const uint32_t l1 = static_cast<uint32_t>(left[i + 1]);
+        const uint32_t r1 = static_cast<uint32_t>(right[i + 1]);
+
+        out32[0] = (l0 & 0xFFFFFFU) | (r0 << 24);       // NOLINT(readability-magic-numbers)
+        out32[1] = ((r0 >> 8) & 0xFFFFU) | (l1 << 16);  // NOLINT(readability-magic-numbers)
+        out32[2] = ((l1 >> 16) & 0xFFU) | (r1 << 8);    // NOLINT(readability-magic-numbers)
+        out32 += 3;
+    }
+
+    // Odd-count tail: one stereo sample (6 bytes) remains. Fall back to byte
+    // stores for the final pair to keep the 2-sample fast path simple.
+    if (i < block_size) {
+        uint8_t* tail = reinterpret_cast<uint8_t*>(out32);
+        const int32_t sample_l = left[i];
+        const int32_t sample_r = right[i];
+        tail[0] = static_cast<uint8_t>(sample_l & 0xFF);
+        tail[1] = static_cast<uint8_t>((sample_l >> 8) & 0xFF);
+        tail[2] = static_cast<uint8_t>((sample_l >> 16) & 0xFF);
+        tail[3] = static_cast<uint8_t>(sample_r & 0xFF);
+        tail[4] = static_cast<uint8_t>((sample_r >> 8) & 0xFF);
+        tail[5] = static_cast<uint8_t>((sample_r >> 16) & 0xFF);
+    }
+}
+
 FLAC_OPTIMIZE_O3
 static void write_samples_24bit_2ch(uint8_t* output_buffer, const int32_t* block_samples,
                                     uint32_t block_size) {
@@ -245,6 +289,8 @@ void write_samples(uint8_t* output_buffer, const int32_t* block_samples, uint32_
             write_samples_16bit_1ch(output_buffer, block_samples, block_size);
         } else if (aligned_2 && bits_per_sample == 16 && num_channels == 2) {
             write_samples_16bit_2ch(output_buffer, block_samples, block_size);
+        } else if (aligned_4 && bits_per_sample == 24 && num_channels == 2) {
+            write_samples_24bit_2ch_aligned(output_buffer, block_samples, block_size);
         } else if (bits_per_sample == 24 && num_channels == 2) {
             write_samples_24bit_2ch(output_buffer, block_samples, block_size);
         } else {