diff --git a/README.md b/README.md index 354787f..d4cd779 100644 --- a/README.md +++ b/README.md @@ -163,10 +163,12 @@ Decoding performance for 48kHz stereo audio (full frame, CRC enabled): | Chip | Clock | 16-bit | 24-bit | | ---- | ----- | ------ | ------ | -| ESP32-S3 | 240 MHz | ~25x realtime | ~17x realtime | -| ESP32-P4 | 360 MHz | ~23x realtime | ~16x realtime | +| ESP32 (internal SRAM) | 240 MHz | ~12x realtime | n/a | +| ESP32 (PSRAM) | 240 MHz | ~8x realtime | n/a | +| ESP32-S3 | 240 MHz | ~30x realtime | ~19x realtime | +| ESP32-P4 | 360 MHz | ~25x realtime | ~18x realtime | -Performance varies with block size, prediction order, and sample depth (24-bit requires 64-bit arithmetic). See [examples/decode_benchmark/README.md](examples/decode_benchmark/README.md) for detailed benchmarks, streaming overhead analysis, and instructions for running your own. +ESP32-S3 and ESP32-P4 numbers are measured with the working buffer in PSRAM (the default); PSRAM is fast enough on these chips that switching to internal SRAM only saves ~2-4% on the S3 and well under 1% on the P4. On the original ESP32, PSRAM is much slower than internal SRAM, so placing the working buffer in internal memory (`CONFIG_MICRO_FLAC_PREFER_INTERNAL=y`) is roughly 30-35% faster and is recommended for performance-sensitive use. Performance also varies with block size, prediction order, and sample depth (24-bit requires 64-bit arithmetic). See [examples/decode_benchmark/README.md](examples/decode_benchmark/README.md) for detailed benchmarks, streaming overhead analysis, and instructions for running your own. 
### Memory Usage diff --git a/examples/decode_benchmark/README.md b/examples/decode_benchmark/README.md index 629c218..ae0fd68 100644 --- a/examples/decode_benchmark/README.md +++ b/examples/decode_benchmark/README.md @@ -91,12 +91,12 @@ The benchmark runs each chunk size first with CRC disabled, then with CRC enable CRC Disabled CRC Enabled Test Case Time (ms) Real-time Time (ms) Real-time -------------------- ---------- --------- ---------- --------- - Full frame 1117.8 26.8x 1201.5 25.0x - 1000 byte chunks 1122.1 26.7x 1206.7 24.9x - 500 byte chunks 1127.6 26.6x 1212.8 24.7x - 100 byte chunks 1171.5 25.6x 1260.4 23.8x - 4 byte chunks 2473.3 12.1x 2642.9 11.4x - 1 byte chunks 6769.5 4.4x 7208.0 4.2x + Full frame 918.26 32.7x 991.73 30.3x + 1000 byte chunks 922.81 32.5x 997.24 30.1x + 500 byte chunks 928.47 32.3x 1003.55 29.9x + 100 byte chunks 975.27 30.8x 1053.93 28.5x + 4 byte chunks 2373.16 12.6x 2527.86 11.9x + 1 byte chunks 6935.53 4.3x 7296.88 4.1x ``` ### ESP32-S3 @ 240 MHz (24-bit/48 kHz stereo, 30 seconds, packed 24-bit output) @@ -109,12 +109,12 @@ The benchmark runs each chunk size first with CRC disabled, then with CRC enable CRC Disabled CRC Enabled Test Case Time (ms) Real-time Time (ms) Real-time -------------------- ---------- --------- ---------- --------- - Full frame 1622.7 18.5x 1810.0 16.6x - 1000 byte chunks 1633.2 18.4x 1819.6 16.5x - 500 byte chunks 1645.2 18.2x 1832.6 16.4x - 100 byte chunks 1740.0 17.2x 1935.9 15.5x - 4 byte chunks 4604.2 6.5x 4977.4 6.0x - 1 byte chunks 13553.6 2.2x 14439.6 2.1x + Full frame 1385.14 21.7x 1550.19 19.4x + 1000 byte chunks 1396.60 21.5x 1560.53 19.2x + 500 byte chunks 1409.14 21.3x 1574.06 19.1x + 100 byte chunks 1510.16 19.9x 1682.95 17.8x + 4 byte chunks 4580.11 6.6x 4919.77 6.1x + 1 byte chunks 14542.14 2.1x 15336.69 2.0x ``` ### ESP32-S3 @ 240 MHz (24-bit/48 kHz stereo, 30 seconds, 32-bit output) @@ -127,15 +127,15 @@ The benchmark runs each chunk size first with CRC disabled, then with CRC enable CRC 
Disabled CRC Enabled Test Case Time (ms) Real-time Time (ms) Real-time -------------------- ---------- --------- ---------- --------- - Full frame 1589.8 18.9x 1778.4 16.9x - 1000 byte chunks 1601.2 18.7x 1787.8 16.8x - 500 byte chunks 1613.2 18.6x 1800.9 16.7x - 100 byte chunks 1707.6 17.6x 1903.3 15.8x - 4 byte chunks 4555.4 6.6x 4928.5 6.1x - 1 byte chunks 13455.1 2.2x 14341.4 2.1x + Full frame 1364.75 22.0x 1531.08 19.6x + 1000 byte chunks 1376.54 21.8x 1541.01 19.5x + 500 byte chunks 1389.18 21.6x 1554.69 19.3x + 100 byte chunks 1489.55 20.1x 1662.72 18.0x + 4 byte chunks 4538.67 6.6x 4878.53 6.1x + 1 byte chunks 14435.03 2.1x 15229.60 2.0x ``` -Streaming with chunks of 100 bytes or larger has negligible overhead compared to full-frame decoding. CRC checking adds roughly 5-8% overhead for 16-bit and ~10-12% for 24-bit audio. +Streaming with chunks of 100 bytes or larger has negligible overhead compared to full-frame decoding. CRC checking adds roughly 8% overhead for 16-bit and 12% for 24-bit audio. 
## Interpreting Results @@ -149,13 +149,16 @@ RTF = decode_time / audio_duration ### Expected Performance -| Device | Clock | Bit depth | Expected RTF | Real-time | -|--------|-------|-----------|--------------|-----------| -| ESP32 | 240 MHz | 16-bit | 0.12-0.14 | 7-8x | -| ESP32-S3 | 240 MHz | 16-bit | 0.037-0.040 | 25-27x | -| ESP32-S3 | 240 MHz | 24-bit | 0.054-0.061 | 16-19x | -| ESP32-P4 | 360 MHz | 16-bit | 0.042-0.044 | 23-24x | -| ESP32-P4 | 360 MHz | 24-bit | 0.055-0.061 | 16-18x | +| Device | Clock | Bit depth | Working buffer | Expected RTF | Real-time | +|--------|-------|-----------|----------------|--------------|-----------| +| ESP32 | 240 MHz | 16-bit | PSRAM | 0.107-0.131 | 7-9x | +| ESP32 | 240 MHz | 16-bit | Internal | 0.079-0.087 | 11-13x | +| ESP32-S3 | 240 MHz | 16-bit | PSRAM | 0.031-0.035 | 28-33x | +| ESP32-S3 | 240 MHz | 24-bit | PSRAM | 0.046-0.056 | 18-22x | +| ESP32-P4 | 360 MHz | 16-bit | PSRAM | 0.037-0.041 | 25-27x | +| ESP32-P4 | 360 MHz | 24-bit | PSRAM | 0.050-0.058 | 17-20x | + +On the original ESP32, PSRAM access is much slower than internal SRAM, so placing the working buffer in internal memory (`CONFIG_MICRO_FLAC_PREFER_INTERNAL=y`) is roughly 30-35% faster. On the ESP32-S3, the same switch saves only ~2% (16-bit) to ~4% (24-bit), and on the ESP32-P4 it is below 1%. The S3/P4 numbers above are measured with the default PSRAM placement, and switching to internal SRAM yields essentially the same range. 
Performance varies based on: diff --git a/include/micro_flac/flac_decoder.h b/include/micro_flac/flac_decoder.h index 9a3011c..47d26fe 100644 --- a/include/micro_flac/flac_decoder.h +++ b/include/micro_flac/flac_decoder.h @@ -560,10 +560,12 @@ class FLACDecoder { /// @brief Read partition parameter and escape bits, advancing stage accordingly FLACDecoderResult read_partition_param(uint32_t block_size, uint32_t warm_up_samples); - /// @brief Read Rice-coded signed integer - /// @tparam Resuming false = fresh read (hot path), true = resume after out-of-data - template - inline int32_t read_rice_sint(uint8_t param); + /// @brief Decode one non-escape Rice partition (out-of-lined on purpose). + /// Kept non-inline so the tight loop gets a clean register file, free of + /// pressure from the surrounding subframe state machine. + template + FLACDecoderResult decode_rice_partition(OutputT* out_ptr, uint8_t rice_param, + uint32_t partition_count); /// @brief Drain remaining unconsumed bytes from user buffer into bit_buffer_ void drain_remaining_to_bit_buffer(); @@ -572,9 +574,6 @@ class FLACDecoder { // Bit Stream Reading // ======================================== - /// @brief Refill bit buffer from input stream - inline bool refill_bit_buffer(); - /// @brief Read unsigned integer of specified bit width inline uint32_t read_uint(uint8_t num_bits); diff --git a/src/README.md b/src/README.md index e0a8045..66e9b08 100644 --- a/src/README.md +++ b/src/README.md @@ -14,7 +14,8 @@ Based on [Nayuki's Simple FLAC Implementation](https://www.nayuki.io/res/simple- ### Core Decoder -- `flac_decoder.cpp` - Main decoder: state machine, container detection, header/metadata parsing, subframe decoding, residual decoding, bitstream reading +- `flac_decoder.cpp` - Main decoder: state machine, container detection, header/metadata parsing, subframe decoding, residual decoding +- `bit_reader.h` - Header-only bit-stream primitives: `BitReaderLocal` state struct plus 
`refill_bit_buffer_local()`, `read_uint_local()`, `read_rice_sint_local()`. Header-only so `FLAC_ALWAYS_INLINE` is honored at every call site - `frame_header.h` / `frame_header.cpp` - Frame header parsing: `compute_frame_header_length()`, `parse_frame_header()` (sync validation, field extraction, CRC-8 check, STREAMINFO validation) - `decorrelation.h` / `decorrelation.cpp` - Stereo channel decorrelation: `apply_channel_decorrelation()` for LEFT_SIDE, RIGHT_SIDE, and MID_SIDE joint stereo modes @@ -125,7 +126,9 @@ After all subframes are decoded, channel decorrelation is applied via `apply_cha ### Bitstream Reading -The decoder uses a platform-sized bit buffer: 64-bit on host/64-bit platforms (refilled 8 bytes at a time) and 32-bit on ESP32/32-bit platforms (refilled 4 bytes at a time). This avoids unnecessary 64-bit arithmetic on embedded targets while reducing refill frequency on desktop. Read functions are inlined. +The bit-stream primitives live in `bit_reader.h` as header-only `FLAC_ALWAYS_INLINE` functions operating on a `BitReaderLocal` stack struct. Hoisting bit-reader state into a local struct lets the compiler keep it in registers across hot loops, avoiding aliasing-induced spills through the decoder's member fields. + +The decoder uses a platform-sized bit buffer: 64-bit on host/64-bit platforms (refilled 8 bytes at a time) and 32-bit on ESP32/32-bit platforms (refilled 4 bytes at a time). This avoids unnecessary 64-bit arithmetic on embedded targets while reducing refill frequency on desktop. ### LPC Accumulator Type Selection diff --git a/src/bit_reader.h b/src/bit_reader.h new file mode 100644 index 0000000..4c60e30 --- /dev/null +++ b/src/bit_reader.h @@ -0,0 +1,306 @@ +// Copyright 2026 Kevin Ahrendt +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "compiler.h" +#include "micro_flac/flac_decoder.h" + +#include +#include + +namespace micro_flac { + +// Generate a bitmask with num_bits set to 1 (e.g., num_bits=3 -> 0b111 = 7) +// This replaces the UINT_MASK lookup table with bit manipulation for better performance +static FLAC_ALWAYS_INLINE uint32_t uint_mask(uint32_t num_bits) { + return (num_bits >= 32) ? UINT32_MAX : ((1U << num_bits) - 1); +} + +// Mask for bit buffer width (used to mask shift amounts) +static constexpr uint32_t BIT_BUFFER_SHIFT_MASK = BIT_BUFFER_BITS - 1; + +// ============================================================================ +// Local bit-reader state for the Rice partition hot loop +// ============================================================================ +// +// Hoisting the decoder's bit-reader fields (bit_buffer_, bit_buffer_length_, +// buffer_, buffer_index_, bytes_left_) into a stack-local struct lets GCC +// promote them into registers across the sample loop. Without this, writes +// through out_ptr can alias the member fields in GCC's view, causing it to +// spill/reload bit_buffer_ (and others) on every iteration. +// +// These helpers are the single implementation of the bit-stream primitives. +// FLACDecoder::read_uint is a thin wrapper that copies state between `this->` +// and a stack-local struct around a call to read_uint_local. 
+// Rice-coded residual reads (decode_rice_partition) use read_rice_sint_local +// directly against a shared BitReaderLocal to avoid paying for an extra +// store/load across the resume/hot-loop boundary. +struct BitReaderLocal { + bit_buffer_t bit_buffer; + const uint8_t* buffer; + size_t buffer_index; + size_t bytes_left; + uint32_t bit_buffer_length; +}; + +// Refill the bit buffer from the input stream. Returns true on out-of-data +// (zero bytes remaining); otherwise loads as many bytes as fit into +// bit_buffer, advances buffer_index/bytes_left, and sets bit_buffer_length. +// +// ESP-IDF disables jump tables by default (-fno-jump-tables), so a switch +// statement compiles to a chain of comparisons anyway. Using explicit +// if/else with FLAC_LIKELY on the hot path lets the compiler prioritize it. +// +// All paths overwrite bit_buffer with only the newly loaded bytes. Old bits +// are NOT preserved. This is safe because both callers handle old bits +// before calling refill: +// - read_uint_local() extracts old bits into its local `result` before +// calling refill +// - read_rice_sint_local() only calls refill when bit_buffer_length == 0 +// (no old bits) +static FLAC_ALWAYS_INLINE bool refill_bit_buffer_local(BitReaderLocal& s) { +#if (BIT_BUFFER_BITS == 64) + if (FLAC_LIKELY(s.bytes_left >= 8)) { + s.bit_buffer = (static_cast(s.buffer[s.buffer_index]) << 56) | + (static_cast(s.buffer[s.buffer_index + 1]) << 48) | + (static_cast(s.buffer[s.buffer_index + 2]) << 40) | + (static_cast(s.buffer[s.buffer_index + 3]) << 32) | + (static_cast(s.buffer[s.buffer_index + 4]) << 24) | + (static_cast(s.buffer[s.buffer_index + 5]) << 16) | + (static_cast(s.buffer[s.buffer_index + 6]) << 8) | + s.buffer[s.buffer_index + 7]; + s.buffer_index += 8; + s.bytes_left -= 8; + s.bit_buffer_length = 64; + return false; + } + if (s.bytes_left == 7) { + s.bit_buffer = (static_cast(s.buffer[s.buffer_index]) << 48) | + (static_cast(s.buffer[s.buffer_index + 1]) << 40) | + 
(static_cast(s.buffer[s.buffer_index + 2]) << 32) | + (static_cast(s.buffer[s.buffer_index + 3]) << 24) | + (static_cast(s.buffer[s.buffer_index + 4]) << 16) | + (static_cast(s.buffer[s.buffer_index + 5]) << 8) | + s.buffer[s.buffer_index + 6]; + s.buffer_index += 7; + s.bit_buffer_length = 56; + s.bytes_left = 0; + return false; + } + if (s.bytes_left == 6) { + s.bit_buffer = (static_cast(s.buffer[s.buffer_index]) << 40) | + (static_cast(s.buffer[s.buffer_index + 1]) << 32) | + (static_cast(s.buffer[s.buffer_index + 2]) << 24) | + (static_cast(s.buffer[s.buffer_index + 3]) << 16) | + (static_cast(s.buffer[s.buffer_index + 4]) << 8) | + s.buffer[s.buffer_index + 5]; + s.buffer_index += 6; + s.bit_buffer_length = 48; + s.bytes_left = 0; + return false; + } + if (s.bytes_left == 5) { + s.bit_buffer = (static_cast(s.buffer[s.buffer_index]) << 32) | + (static_cast(s.buffer[s.buffer_index + 1]) << 24) | + (static_cast(s.buffer[s.buffer_index + 2]) << 16) | + (static_cast(s.buffer[s.buffer_index + 3]) << 8) | + s.buffer[s.buffer_index + 4]; + s.buffer_index += 5; + s.bit_buffer_length = 40; + s.bytes_left = 0; + return false; + } + if (s.bytes_left == 4) { + s.bit_buffer = (static_cast(s.buffer[s.buffer_index]) << 24) | + (static_cast(s.buffer[s.buffer_index + 1]) << 16) | + (static_cast(s.buffer[s.buffer_index + 2]) << 8) | + s.buffer[s.buffer_index + 3]; + s.buffer_index += 4; + s.bit_buffer_length = 32; + s.bytes_left = 0; + return false; + } +#else + if (FLAC_LIKELY(s.bytes_left >= 4)) { + s.bit_buffer = (static_cast(s.buffer[s.buffer_index]) << 24) | + (static_cast(s.buffer[s.buffer_index + 1]) << 16) | + (static_cast(s.buffer[s.buffer_index + 2]) << 8) | + s.buffer[s.buffer_index + 3]; + s.buffer_index += 4; + s.bytes_left -= 4; + s.bit_buffer_length = 32; + return false; + } +#endif + if (s.bytes_left == 3) { + s.bit_buffer = (static_cast(s.buffer[s.buffer_index]) << 16) | + (static_cast(s.buffer[s.buffer_index + 1]) << 8) | + s.buffer[s.buffer_index + 2]; + 
s.buffer_index += 3; + s.bit_buffer_length = 24; + s.bytes_left = 0; + return false; + } + if (s.bytes_left == 2) { + s.bit_buffer = (static_cast(s.buffer[s.buffer_index]) << 8) | + s.buffer[s.buffer_index + 1]; + s.buffer_index += 2; + s.bit_buffer_length = 16; + s.bytes_left = 0; + return false; + } + if (s.bytes_left == 1) { + s.bit_buffer = s.buffer[s.buffer_index]; + s.buffer_index += 1; + s.bit_buffer_length = 8; + s.bytes_left = 0; + return false; + } + return true; +} + +// Read an unsigned integer of `num_bits` bits using only the local state +// struct. On out-of-data, sets `out_of_data=true` and returns 0 without +// consuming any bits (bit_buffer/bit_buffer_length are left untouched in +// that case, matching the member read_uint contract). +static FLAC_ALWAYS_INLINE uint32_t read_uint_local(BitReaderLocal& s, uint8_t num_bits, + bool& out_of_data) { + uint32_t result = 0; + + if (num_bits > s.bit_buffer_length) { + const uint32_t new_bits_needed = num_bits - s.bit_buffer_length; + const size_t bytes_needed = (new_bits_needed + 7) / 8; + + if (FLAC_UNLIKELY(s.bytes_left < bytes_needed)) { + out_of_data = true; + return 0; + } + + if (new_bits_needed < BIT_BUFFER_BITS) { + // Some of the current bits will be used in the result + result = static_cast(s.bit_buffer << new_bits_needed); + } + + refill_bit_buffer_local(s); + s.bit_buffer_length = s.bit_buffer_length - new_bits_needed; + } else { + s.bit_buffer_length -= num_bits; + } + + result |= static_cast(s.bit_buffer >> (s.bit_buffer_length & BIT_BUFFER_SHIFT_MASK)); + result &= uint_mask(num_bits); + return result; +} + +// Read a Rice-coded signed integer using only the local state struct. +// `mask` must equal `(1U << param) - 1` and is passed explicitly so the +// caller can pin it to a register across the partition loop (the inlined +// uint_mask otherwise tends to spill to the stack). 
+// On out-of-data, sets *out_of_data=true and writes partial-progress info +// into *unary_count_out and *binary_pending_out so the caller can persist +// it into the decoder's RiceState for resume. +// +// Template param `Resuming`: +// false (hot path): start unary_count=0 and always run the unary phase. +// true : seed unary_count from `unary_count_in`, and skip the +// unary phase when `binary_pending_in` is true (the +// unary phase already completed on a prior call). +// With FLAC_ALWAYS_INLINE, the `Resuming` branches fold away at both call +// sites, so the hot-path instruction sequence is identical to the previous +// non-templated helper. +template +static FLAC_ALWAYS_INLINE int32_t read_rice_sint_local( + BitReaderLocal& s, uint8_t param, uint32_t mask, bool* out_of_data, uint32_t* unary_count_out, + bool* binary_pending_out, uint32_t unary_count_in = 0, bool binary_pending_in = false) { + uint32_t unary_count = Resuming ? unary_count_in : 0; + if (!Resuming || !binary_pending_in) { + while (true) { + if (s.bit_buffer_length == 0) { + if (FLAC_UNLIKELY(refill_bit_buffer_local(s))) { + *unary_count_out = unary_count; + *binary_pending_out = false; + *out_of_data = true; + return 0; + } + } + bit_buffer_t shifted = s.bit_buffer << (BIT_BUFFER_BITS - s.bit_buffer_length); + if (FLAC_UNLIKELY(shifted == 0)) { + unary_count += s.bit_buffer_length; + s.bit_buffer_length = 0; + continue; + } + uint32_t leading_zeros = static_cast(FLAC_CLZ(shifted)); + unary_count += leading_zeros; + s.bit_buffer_length = s.bit_buffer_length - (leading_zeros + 1); + break; + } + } + + // Rice parameter is structurally bounded to < 32 by the FLAC spec + // (RFC 9639 §9.2.7): the partition parameter is read as a 4-bit or 5-bit + // field (residual coding method 0 or 1), giving max values of 15 and 31 + // respectively. For method 1, value 31 is the escape marker; this + // function is only invoked from the non-escape branch, so param is + // guaranteed <= 30. 
Even with corrupted input, read_uint(N) physically + // cannot return more than 2^N - 1, so the invariant holds regardless. + // + // Hinting this to the compiler lets it drop the `num_bits >= 32` guard + // inside uint_mask / read_uint_local, which in turn lets register + // allocation keep `buffer_` pinned in a register (previously occupied by + // the constant 31) and eliminates a per-sample conditional mask reload. + // + // Safety note: this assumption is also relied on by `(unary_count << param)` + // below, which is UB for param >= 32. We are making an already-required + // invariant explicit to the optimizer, not introducing a new one. + FLAC_ASSUME(param < 32); + + // Binary phase. Inline the fast path so the precomputed `mask` is used + // directly instead of going through uint_mask (which, even with the + // unreachable hint above, tends to spill the computed mask to the stack). + uint32_t binary = 0; + if (FLAC_LIKELY(param <= s.bit_buffer_length)) { + s.bit_buffer_length = s.bit_buffer_length - param; + binary = + static_cast(s.bit_buffer >> (s.bit_buffer_length & BIT_BUFFER_SHIFT_MASK)) & + mask; + } else { + // Slow path: binary field straddles a refill. Inline read_uint here so + // we can reuse the caller-provided `mask` instead of rebuilding it via + // uint_mask (which tends to spill). Since param < 32 and + // bit_buffer_length >= 0, new_bits_needed is in [1, 31] (always + // strictly less than BIT_BUFFER_BITS on both 32- and 64-bit hosts), so + // we can skip the read_uint_local `new_bits_needed >= BIT_BUFFER_BITS` + // edge case. 
+ const uint32_t new_bits_needed = param - s.bit_buffer_length; + const size_t bytes_needed = (new_bits_needed + 7) / 8; + if (FLAC_UNLIKELY(s.bytes_left < bytes_needed)) { + *unary_count_out = unary_count; + *binary_pending_out = true; + *out_of_data = true; + return 0; + } + const uint32_t high = static_cast(s.bit_buffer << new_bits_needed); + refill_bit_buffer_local(s); + s.bit_buffer_length = s.bit_buffer_length - new_bits_needed; + binary = (high | static_cast(s.bit_buffer >> + (s.bit_buffer_length & BIT_BUFFER_SHIFT_MASK))) & + mask; + } + + uint32_t value = (unary_count << param) | binary; + return static_cast((value >> 1) ^ -(value & 1)); +} + +} // namespace micro_flac diff --git a/src/compiler.h b/src/compiler.h index 7ca9d50..2d16ab1 100644 --- a/src/compiler.h +++ b/src/compiler.h @@ -45,6 +45,17 @@ #define FLAC_HOT #endif +// Prevent inlining. Useful for extracting a tight loop into its own function +// so the compiler can allocate registers for it without pressure from a +// surrounding large switch/state machine. +#if defined(__GNUC__) || defined(__clang__) +#define FLAC_NOINLINE __attribute__((noinline)) +#elif defined(_MSC_VER) +#define FLAC_NOINLINE __declspec(noinline) +#else +#define FLAC_NOINLINE +#endif + // Branch prediction hints #if defined(__GNUC__) || defined(__clang__) #define FLAC_LIKELY(x) __builtin_expect(!!(x), 1) @@ -54,6 +65,33 @@ #define FLAC_UNLIKELY(x) (x) #endif +// Hint to the optimizer that an expression is true. Used to prune impossible +// branches and tighten codegen on the hot path. Clang has `__builtin_assume`; +// GCC lacks a direct equivalent, so fall back to the +// `if (!x) __builtin_unreachable();` idiom (which evaluates `x`, unlike +// __builtin_assume — keep `x` side-effect free). 
+#if defined(__clang__) +#define FLAC_ASSUME(x) __builtin_assume(x) +#elif defined(__GNUC__) +#define FLAC_ASSUME(x) \ + do { \ + if (!(x)) \ + __builtin_unreachable(); \ + } while (0) +#else +#define FLAC_ASSUME(x) ((void)0) +#endif + +// Silence UBSan signed-integer-overflow in LPC restore functions. +// FLAC LPC prediction intentionally uses wrapping int32_t arithmetic; +// overflows in this path are audio-only and not a security concern. +// Same approach as libFLAC (FLAC__lpc_restore_signal). +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && (defined(__GNUC__) || defined(__clang__)) +#define FLAC_NO_SANITIZE_OVERFLOW __attribute__((no_sanitize("signed-integer-overflow"))) +#else +#define FLAC_NO_SANITIZE_OVERFLOW +#endif + #include namespace micro_flac { diff --git a/src/crc.cpp b/src/crc.cpp index 94479ab..a9193c2 100644 --- a/src/crc.cpp +++ b/src/crc.cpp @@ -250,28 +250,48 @@ uint8_t calculate_crc8(const uint8_t* data, size_t len) { } FLAC_OPTIMIZE_O3 -uint16_t update_crc16(uint16_t crc, const uint8_t* data, size_t len) { +uint16_t update_crc16(uint16_t crc_in, const uint8_t* data, size_t len) { const uint8_t* end = data + len; + // Hold CRC in a 32-bit local so GCC doesn't insert a per-iteration + // `extui crc, crc, 0, 16` to honor uint16_t semantics. We only read bits + // [15:8] via a bit-field extract and XOR-in a 16-bit table value, so the + // upper bits are harmless until we narrow on return. 
+ uint32_t crc = crc_in; + + const uint8_t* end8 = data + (len & ~static_cast(7)); // Round down to multiple of 8 #if UINTPTR_MAX != 0xFFFFFFFF // On 64-bit hosts, process 8 bytes at a time using slicing-by-8 - const uint8_t* end8 = data + (len & ~7U); // Round down to multiple of 8 - while (data < end8) { - crc = static_cast( - CRC16_TABLE_7[(crc >> 8) ^ data[0]] ^ CRC16_TABLE_6[(crc & 0xFF) ^ data[1]] ^ - CRC16_TABLE_5[data[2]] ^ CRC16_TABLE_4[data[3]] ^ CRC16_TABLE_3[data[4]] ^ - CRC16_TABLE_2[data[5]] ^ CRC16_TABLE_1[data[6]] ^ CRC16_TABLE_0[data[7]]); + crc = CRC16_TABLE_7[((crc >> 8) & 0xFF) ^ data[0]] ^ CRC16_TABLE_6[(crc & 0xFF) ^ data[1]] ^ + CRC16_TABLE_5[data[2]] ^ CRC16_TABLE_4[data[3]] ^ CRC16_TABLE_3[data[4]] ^ + CRC16_TABLE_2[data[5]] ^ CRC16_TABLE_1[data[6]] ^ CRC16_TABLE_0[data[7]]; + data += 8; + } +#else + // On 32-bit hosts, process 8 bytes per iteration using only TABLE_0. + // Amortizes the pointer advance and loop overhead across 8 bytes + // (7 + 1/8 = 7.125 insts/byte vs 8 for byte-at-a-time). Uses only + // TABLE_0 so no extra cache/flash pressure from additional tables. 
+ while (data < end8) { + crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[0]]; + crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[1]]; + crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[2]]; + crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[3]]; + crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[4]]; + crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[5]]; + crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[6]]; + crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ data[7]]; data += 8; } #endif // UINTPTR_MAX != 0xFFFFFFFF - // Byte-at-a-time (handles remaining bytes on 64-bit, all bytes on 32-bit) + // Byte-at-a-time (handles trailing bytes 0..7) while (data < end) { - crc = static_cast((crc << 8) ^ CRC16_TABLE_0[(crc >> 8) ^ *data++]); + crc = (crc << 8) ^ CRC16_TABLE_0[((crc >> 8) & 0xFF) ^ *data++]; } - return crc; + return static_cast(crc); } } // namespace micro_flac diff --git a/src/decorrelation.cpp b/src/decorrelation.cpp index 4be6243..e246a1c 100644 --- a/src/decorrelation.cpp +++ b/src/decorrelation.cpp @@ -55,8 +55,9 @@ void apply_channel_decorrelation(int32_t* block_samples, const SideT* side_chann for (; i < block_size; i++) { block_samples[i] = wadd32(side_channel[i], block_samples[block_size + i]); } - } else if (channel_assignment == CHANNEL_MID_SIDE) { - // MID_SIDE: Left = Mid + Side/2, Right = Mid - Side/2 + } else { + // MID_SIDE (caller guarantees channel_assignment is 8, 9, or 10). + // Left = Mid + Side/2, Right = Mid - Side/2 // Arithmetic right shift for side>>1 is required by the FLAC spec. // With SideT=int64_t, the shift operates on the full 33-bit value. 
// Process 4 samples at a time diff --git a/src/flac_decoder.cpp b/src/flac_decoder.cpp index 9f76ca3..ca9398a 100644 --- a/src/flac_decoder.cpp +++ b/src/flac_decoder.cpp @@ -15,6 +15,7 @@ #include "micro_flac/flac_decoder.h" #include "alloc.h" +#include "bit_reader.h" #include "compiler.h" #include "crc.h" #include "decorrelation.h" @@ -55,14 +56,8 @@ static constexpr const int32_t* FIXED_COEFFICIENTS[] = {nullptr, FIXED_COEFFICIE FIXED_COEFFICIENTS_2, FIXED_COEFFICIENTS_3, FIXED_COEFFICIENTS_4}; -// Generate a bitmask with num_bits set to 1 (e.g., num_bits=3 -> 0b111 = 7) -// This replaces the UINT_MASK lookup table with bit manipulation for better performance -static FLAC_ALWAYS_INLINE uint32_t uint_mask(uint32_t num_bits) { - return (num_bits >= 32) ? UINT32_MAX : ((1U << num_bits) - 1); -} - -// Mask for bit buffer width (used to mask shift amounts) -static constexpr uint32_t BIT_BUFFER_SHIFT_MASK = BIT_BUFFER_BITS - 1; +// The bit-reader primitives (BitReaderLocal, refill_bit_buffer_local, +// read_uint_local, read_rice_sint_local) live in bit_reader.h. // ============================================================================ // FLACMetadataBlock Lifecycle @@ -769,7 +764,7 @@ FLAC_HOT FLACDecoderResult FLACDecoder::decode_frame(const uint8_t* buffer, size return FLAC_DECODER_ERROR_INTERNAL; } -FLAC_HOT void FLACDecoder::reset_frame_state() { +void FLACDecoder::reset_frame_state() { this->frame_ = FrameState{}; this->subframe_ = SubframeState{}; this->residual_ = ResidualState{}; @@ -1323,39 +1318,12 @@ FLAC_OPTIMIZE_O3 FLACDecoderResult FLACDecoder::decode_residuals(OutputT* sub_fr } this->residual_.sample_idx = sample_idx_e; } else { - // Residuals are always decoded as int32_t even on the int64_t (wide-side) - // path. Per RFC 9639 §9.2.7.3, Rice-coded residuals must fit in a signed - // 32-bit two's complement integer (excluding INT32_MIN), so int32_t is - // sufficient regardless of the output sample type. 
- - // Hoist struct fields into locals for the hot loop. - // This lets the compiler keep them in registers instead - // of reloading from memory on every iteration. - uint32_t sample_idx = this->residual_.sample_idx; - const uint32_t partition_count = this->residual_.partition_count; const uint8_t rice_param = static_cast(this->residual_.param); - - // If resuming mid-rice-read, finish that one sample first - if (this->rice_.pending) { - int32_t val = this->read_rice_sint(rice_param); - if (FLAC_UNLIKELY(this->out_of_data_)) { - this->residual_.sample_idx = sample_idx; - return FLAC_DECODER_NEED_MORE_DATA; - } - out_ptr[sample_idx] = val; - sample_idx++; - this->rice_.pending = false; - } - for (; sample_idx < partition_count; sample_idx++) { - int32_t val = this->read_rice_sint(rice_param); - if (FLAC_UNLIKELY(this->out_of_data_)) { - this->residual_.sample_idx = sample_idx; - this->rice_.pending = true; - return FLAC_DECODER_NEED_MORE_DATA; - } - out_ptr[sample_idx] = val; + FLACDecoderResult ret = this->decode_rice_partition( + out_ptr, rice_param, this->residual_.partition_count); + if (ret != FLAC_DECODER_SUCCESS) { + return ret; } - this->residual_.sample_idx = sample_idx; } this->residual_.out_ptr_offset += this->residual_.partition_count; @@ -1379,6 +1347,100 @@ FLAC_OPTIMIZE_O3 FLACDecoderResult FLACDecoder::decode_residuals(OutputT* sub_fr template FLACDecoderResult FLACDecoder::decode_residuals(int32_t*, uint32_t, uint32_t); template FLACDecoderResult FLACDecoder::decode_residuals(int64_t*, uint32_t, uint32_t); +// Non-inline so the hot Rice sample loop gets a fresh register file. The +// surrounding subframe state machine holds many live values; isolating this +// loop in its own function lets the compiler allocate registers just for +// bit-reader state + partition locals. 
+template +FLAC_HOT FLAC_NOINLINE FLACDecoderResult +FLACDecoder::decode_rice_partition(OutputT* out_ptr, uint8_t rice_param, uint32_t partition_count) { + // Residuals are always decoded as int32_t even on the int64_t (wide-side) path. + // Per RFC 9639 §9.2.7.3, Rice-coded residuals must fit in a signed 32-bit two's + // complement integer (excluding INT32_MIN), so int32_t is sufficient regardless + // of the output sample type. + uint32_t sample_idx = this->residual_.sample_idx; + + // Hoist bit-reader state into a stack-local struct so the compiler can + // keep it in registers across the loop instead of reloading through this-> + // on every iteration. Shared between the resume prefix and the hot loop + // so we don't pay an extra store/load around the resume. + BitReaderLocal st{}; + st.bit_buffer = this->bit_buffer_; + st.bit_buffer_length = this->bit_buffer_length_; + st.buffer = this->buffer_; + st.buffer_index = this->buffer_index_; + st.bytes_left = this->bytes_left_; + + bool oo_d = false; + uint32_t unary_pending = 0; + bool binary_pending = false; + + // Precompute the Rice mask and try to pin it to a register across the loop. + // On Xtensa, GCC otherwise spills the mask to the stack and reloads it + // inside the inlined binary extraction on every sample (an extra l32i per + // residual on the hot path). Binding to a15, the last callee-saved + // general-purpose register in the current window, keeps it live without + // displacing the bit-reader state that's already hot in registers. +#if defined(__xtensa__) || defined(__XTENSA__) + register uint32_t rice_mask asm("a15") = (static_cast(1) << rice_param) - 1; +#else + const uint32_t rice_mask = (static_cast(1) << rice_param) - 1; +#endif + + // If resuming mid-rice-read, finish that one sample on the resume path + // before entering the non-resuming hot loop. Rice parameter is < 32 + // (see note inside read_rice_sint_local); the assert here guards that + // invariant locally. 
+ if (FLAC_UNLIKELY(this->rice_.pending)) { + assert(rice_param < 32); + int32_t val = read_rice_sint_local(st, rice_param, rice_mask, &oo_d, &unary_pending, + &binary_pending, this->rice_.unary_count, + this->rice_.binary_pending); + if (FLAC_UNLIKELY(oo_d)) { + this->bit_buffer_ = st.bit_buffer; + this->bit_buffer_length_ = static_cast(st.bit_buffer_length); + this->buffer_index_ = st.buffer_index; + this->bytes_left_ = st.bytes_left; + this->out_of_data_ = true; + this->rice_.unary_count = unary_pending; + this->rice_.binary_pending = binary_pending; + this->residual_.sample_idx = sample_idx; + return FLAC_DECODER_NEED_MORE_DATA; + } + out_ptr[sample_idx] = val; + sample_idx++; + this->rice_.pending = false; + } + + for (; sample_idx < partition_count; sample_idx++) { + int32_t val = + read_rice_sint_local(st, rice_param, rice_mask, &oo_d, &unary_pending, &binary_pending); + if (FLAC_UNLIKELY(oo_d)) { + this->bit_buffer_ = st.bit_buffer; + this->bit_buffer_length_ = static_cast(st.bit_buffer_length); + this->buffer_index_ = st.buffer_index; + this->bytes_left_ = st.bytes_left; + this->out_of_data_ = true; + this->rice_.unary_count = unary_pending; + this->rice_.binary_pending = binary_pending; + this->rice_.pending = true; + this->residual_.sample_idx = sample_idx; + return FLAC_DECODER_NEED_MORE_DATA; + } + out_ptr[sample_idx] = val; + } + + this->bit_buffer_ = st.bit_buffer; + this->bit_buffer_length_ = static_cast(st.bit_buffer_length); + this->buffer_index_ = st.buffer_index; + this->bytes_left_ = st.bytes_left; + this->residual_.sample_idx = sample_idx; + return FLAC_DECODER_SUCCESS; +} + +template FLACDecoderResult FLACDecoder::decode_rice_partition(int32_t*, uint8_t, uint32_t); +template FLACDecoderResult FLACDecoder::decode_rice_partition(int64_t*, uint8_t, uint32_t); + FLAC_HOT FLACDecoderResult FLACDecoder::read_partition_param(uint32_t block_size, uint32_t warm_up_samples) { if (this->subframe_.stage == SubframeDecodeStage::RESIDUAL_PARTITION_PARAM) { 
@@ -1417,54 +1479,6 @@ FLAC_HOT FLACDecoderResult FLACDecoder::read_partition_param(uint32_t block_size return FLAC_DECODER_SUCCESS; } -template -FLAC_ALWAYS_INLINE int32_t FLACDecoder::read_rice_sint(uint8_t param) { - uint32_t unary_count = Resuming ? this->rice_.unary_count : 0; - - if (!Resuming || !this->rice_.binary_pending) { - // Unary phase: count leading zeros - while (true) { - if (this->bit_buffer_length_ == 0) { - if (FLAC_UNLIKELY(this->refill_bit_buffer())) { - this->rice_.unary_count = unary_count; - this->rice_.binary_pending = false; - this->out_of_data_ = true; - return 0; - } - } - - bit_buffer_t shifted_buffer = this->bit_buffer_ - << (BIT_BUFFER_BITS - this->bit_buffer_length_); - - if (FLAC_UNLIKELY(shifted_buffer == 0)) { - unary_count += this->bit_buffer_length_; - this->bit_buffer_length_ = 0; - continue; - } - - uint32_t leading_zeros = static_cast(FLAC_CLZ(shifted_buffer)); - unary_count += leading_zeros; - this->bit_buffer_length_ = - static_cast(this->bit_buffer_length_ - (leading_zeros + 1)); - break; - } - } - - // Binary phase: read rice parameter bits - uint32_t binary = this->read_uint(param); - if (FLAC_UNLIKELY(this->out_of_data_)) { - this->rice_.unary_count = unary_count; - this->rice_.binary_pending = true; - return 0; - } - - // Rice parameter is at most 30 (5-bit param, with 31 reserved as escape code), - // so shifting a uint32_t left by param is always well-defined. - assert(param < 32); - uint32_t value = (unary_count << param) | binary; - return static_cast((value >> 1) ^ -(value & 1)); -} - void FLACDecoder::drain_remaining_to_bit_buffer() { // Drain unconsumed bytes from user's buffer into bit_buffer_. 
// Safe because: when read_uint fails, bit_buffer_length_ + 8*bytes_left_ < BIT_BUFFER_BITS @@ -1480,146 +1494,24 @@ void FLACDecoder::drain_remaining_to_bit_buffer() { // Bit Stream Reading // ============================================================================ -FLAC_ALWAYS_INLINE bool FLACDecoder::refill_bit_buffer() { - // ESP-IDF disables jump tables by default (-fno-jump-tables), so a switch statement - // compiles to a chain of comparisons anyway. Using explicit if/else with FLAC_LIKELY - // on the hot path lets the compiler prioritize it. - // - // All paths overwrite bit_buffer_ with only the newly loaded bytes. Old bits are NOT - // preserved. This is safe because both callers handle old bits before calling refill: - // - read_uint() extracts old bits into its local `result` before calling refill - // - read_rice_sint() only calls refill when bit_buffer_length_ == 0 (no old bits) -#if (BIT_BUFFER_BITS == 64) - if (FLAC_LIKELY(this->bytes_left_ >= 8)) { - // 8 or more bytes available: load 8 bytes big-endian - this->bit_buffer_ = (static_cast(this->buffer_[this->buffer_index_]) << 56) | - (static_cast(this->buffer_[this->buffer_index_ + 1]) << 48) | - (static_cast(this->buffer_[this->buffer_index_ + 2]) << 40) | - (static_cast(this->buffer_[this->buffer_index_ + 3]) << 32) | - (static_cast(this->buffer_[this->buffer_index_ + 4]) << 24) | - (static_cast(this->buffer_[this->buffer_index_ + 5]) << 16) | - (static_cast(this->buffer_[this->buffer_index_ + 6]) << 8) | - this->buffer_[this->buffer_index_ + 7]; - this->buffer_index_ += 8; - this->bytes_left_ -= 8; - this->bit_buffer_length_ = 64; - return false; - } - if (this->bytes_left_ == 7) { - this->bit_buffer_ = (static_cast(this->buffer_[this->buffer_index_]) << 48) | - (static_cast(this->buffer_[this->buffer_index_ + 1]) << 40) | - (static_cast(this->buffer_[this->buffer_index_ + 2]) << 32) | - (static_cast(this->buffer_[this->buffer_index_ + 3]) << 24) | - 
(static_cast(this->buffer_[this->buffer_index_ + 4]) << 16) | - (static_cast(this->buffer_[this->buffer_index_ + 5]) << 8) | - this->buffer_[this->buffer_index_ + 6]; - this->buffer_index_ += 7; - this->bit_buffer_length_ = 56; - this->bytes_left_ = 0; - return false; - } - if (this->bytes_left_ == 6) { - this->bit_buffer_ = (static_cast(this->buffer_[this->buffer_index_]) << 40) | - (static_cast(this->buffer_[this->buffer_index_ + 1]) << 32) | - (static_cast(this->buffer_[this->buffer_index_ + 2]) << 24) | - (static_cast(this->buffer_[this->buffer_index_ + 3]) << 16) | - (static_cast(this->buffer_[this->buffer_index_ + 4]) << 8) | - this->buffer_[this->buffer_index_ + 5]; - this->buffer_index_ += 6; - this->bit_buffer_length_ = 48; - this->bytes_left_ = 0; - return false; - } - if (this->bytes_left_ == 5) { - this->bit_buffer_ = (static_cast(this->buffer_[this->buffer_index_]) << 32) | - (static_cast(this->buffer_[this->buffer_index_ + 1]) << 24) | - (static_cast(this->buffer_[this->buffer_index_ + 2]) << 16) | - (static_cast(this->buffer_[this->buffer_index_ + 3]) << 8) | - this->buffer_[this->buffer_index_ + 4]; - this->buffer_index_ += 5; - this->bit_buffer_length_ = 40; - this->bytes_left_ = 0; - return false; - } - if (this->bytes_left_ == 4) { - this->bit_buffer_ = (static_cast(this->buffer_[this->buffer_index_]) << 24) | - (static_cast(this->buffer_[this->buffer_index_ + 1]) << 16) | - (static_cast(this->buffer_[this->buffer_index_ + 2]) << 8) | - this->buffer_[this->buffer_index_ + 3]; - this->buffer_index_ += 4; - this->bit_buffer_length_ = 32; - this->bytes_left_ = 0; - return false; - } -#else - if (FLAC_LIKELY(this->bytes_left_ >= 4)) { - // 4 or more bytes available: load 4 bytes big-endian - this->bit_buffer_ = (static_cast(this->buffer_[this->buffer_index_]) << 24) | - (static_cast(this->buffer_[this->buffer_index_ + 1]) << 16) | - (static_cast(this->buffer_[this->buffer_index_ + 2]) << 8) | - this->buffer_[this->buffer_index_ + 3]; - 
this->buffer_index_ += 4; - this->bytes_left_ -= 4; - this->bit_buffer_length_ = 32; - return false; - } -#endif - if (this->bytes_left_ == 3) { - this->bit_buffer_ = - (static_cast(this->buffer_[this->buffer_index_]) << 16) | - (static_cast(this->buffer_[this->buffer_index_ + 1]) << 8) | - this->buffer_[this->buffer_index_ + 2]; - this->buffer_index_ += 3; - this->bit_buffer_length_ = 24; - this->bytes_left_ = 0; - return false; - } - if (this->bytes_left_ == 2) { - this->bit_buffer_ = (static_cast(this->buffer_[this->buffer_index_]) << 8) | - this->buffer_[this->buffer_index_ + 1]; - this->buffer_index_ += 2; - this->bit_buffer_length_ = 16; - this->bytes_left_ = 0; - return false; - } - if (this->bytes_left_ == 1) { - this->bit_buffer_ = this->buffer_[this->buffer_index_]; - this->buffer_index_ += 1; - this->bit_buffer_length_ = 8; - this->bytes_left_ = 0; - return false; - } - return true; -} - FLAC_ALWAYS_INLINE uint32_t FLACDecoder::read_uint(uint8_t num_bits) { - uint32_t result = 0; - - if (num_bits > this->bit_buffer_length_) { - const uint32_t new_bits_needed = num_bits - this->bit_buffer_length_; - size_t bytes_needed = (new_bits_needed + 7) / 8; - - if (FLAC_UNLIKELY(this->bytes_left_ < bytes_needed)) { - this->out_of_data_ = true; - return 0; - } - - if (new_bits_needed < BIT_BUFFER_BITS) { - // Some of the current bits will be used in the result - result = static_cast(this->bit_buffer_ << new_bits_needed); - } - - this->refill_bit_buffer(); - this->bit_buffer_length_ = static_cast(this->bit_buffer_length_ - new_bits_needed); - } else { - this->bit_buffer_length_ -= num_bits; + BitReaderLocal s{}; + s.bit_buffer = this->bit_buffer_; + s.bit_buffer_length = this->bit_buffer_length_; + s.buffer = this->buffer_; + s.buffer_index = this->buffer_index_; + s.bytes_left = this->bytes_left_; + + bool oo_d = false; + const uint32_t result = read_uint_local(s, num_bits, oo_d); + + this->bit_buffer_ = s.bit_buffer; + this->bit_buffer_length_ = 
static_cast(s.bit_buffer_length); + this->buffer_index_ = s.buffer_index; + this->bytes_left_ = s.bytes_left; + if (FLAC_UNLIKELY(oo_d)) { + this->out_of_data_ = true; } - - result |= static_cast(this->bit_buffer_ >> - (this->bit_buffer_length_ & BIT_BUFFER_SHIFT_MASK)); - - result &= uint_mask(num_bits); - return result; } diff --git a/src/frame_header.cpp b/src/frame_header.cpp index d171279..ace6a90 100644 --- a/src/frame_header.cpp +++ b/src/frame_header.cpp @@ -71,12 +71,8 @@ uint8_t compute_frame_header_length(const uint8_t* header) { FLACDecoderResult parse_frame_header(const uint8_t* header, uint8_t header_len, const FLACStreamInfo& stream_info, bool crc_check, FrameHeaderInfo& info) { - // Bytes 0-1: sync code (already validated by decode_frame_header_phase) - - // Reserved bit check - if (header[1] & 0x02) { - return FLAC_DECODER_ERROR_BAD_HEADER; - } + // Bytes 0-1: sync code + reserved bit (already validated by + // decode_frame_header_phase via the 0xFE mask, which forces header[1] bit 1 = 0) // Byte 2: block_size_code (upper nibble) + sample_rate_code (lower nibble) if (header[2] == 0xFF) { diff --git a/src/lpc.cpp b/src/lpc.cpp index b09b1c5..072ba5d 100644 --- a/src/lpc.cpp +++ b/src/lpc.cpp @@ -14,6 +14,7 @@ #include "lpc.h" +#include "compiler.h" #include "xtensa/lpc_xtensa.h" #include @@ -79,8 +80,9 @@ static bool can_use_lpc_32bit(uint32_t bits_per_sample, const int32_t* coefs, ui } template -static void restore_lpc_32bit_order(int32_t* sub_frame_buffer, size_t num_of_samples, - const int32_t* coefs, int32_t shift) { +FLAC_NO_SANITIZE_OVERFLOW static void restore_lpc_32bit_order(int32_t* sub_frame_buffer, + size_t num_of_samples, + const int32_t* coefs, int32_t shift) { const size_t outer_loop_bound = num_of_samples - ORDER; for (size_t i = 0; i < outer_loop_bound; ++i) { @@ -92,8 +94,9 @@ static void restore_lpc_32bit_order(int32_t* sub_frame_buffer, size_t num_of_sam } } -static void restore_lpc_32bit(int32_t* sub_frame_buffer, size_t 
num_of_samples, - const int32_t* coefs, uint32_t order, int32_t shift) { +FLAC_NO_SANITIZE_OVERFLOW static void restore_lpc_32bit(int32_t* sub_frame_buffer, + size_t num_of_samples, const int32_t* coefs, + uint32_t order, int32_t shift) { #if (FLAC_LPC_XTENSA_ENABLED == 1) // Use optimized assembly version for Xtensa restore_lpc_32bit_asm(sub_frame_buffer, num_of_samples, coefs, order, shift); @@ -115,7 +118,7 @@ static void restore_lpc_32bit(int32_t* sub_frame_buffer, size_t num_of_samples, case 12: restore_lpc_32bit_order<12>(sub_frame_buffer, num_of_samples, coefs, shift); break; // NOLINTEND(readability-magic-numbers) // clang-format on - default: { + default: const size_t outer_loop_bound = num_of_samples - order; for (size_t i = 0; i < outer_loop_bound; ++i) { int32_t sum = 0; @@ -125,14 +128,14 @@ static void restore_lpc_32bit(int32_t* sub_frame_buffer, size_t num_of_samples, sub_frame_buffer[i + order] += (sum >> shift); } break; - } } #endif } template -static void restore_lpc_64bit_order(int32_t* sub_frame_buffer, size_t num_of_samples, - const int32_t* coefs, int32_t shift) { +FLAC_NO_SANITIZE_OVERFLOW static void restore_lpc_64bit_order(int32_t* sub_frame_buffer, + size_t num_of_samples, + const int32_t* coefs, int32_t shift) { const size_t outer_loop_bound = num_of_samples - ORDER; for (size_t i = 0; i < outer_loop_bound; ++i) { @@ -144,8 +147,9 @@ static void restore_lpc_64bit_order(int32_t* sub_frame_buffer, size_t num_of_sam } } -static void restore_lpc_64bit(int32_t* sub_frame_buffer, size_t num_of_samples, - const int32_t* coefs, uint32_t order, int32_t shift) { +FLAC_NO_SANITIZE_OVERFLOW static void restore_lpc_64bit(int32_t* sub_frame_buffer, + size_t num_of_samples, const int32_t* coefs, + uint32_t order, int32_t shift) { #if (FLAC_LPC_XTENSA_ENABLED == 1) // Use optimized 64-bit assembly version for Xtensa restore_lpc_64bit_asm(sub_frame_buffer, num_of_samples, coefs, order, shift); @@ -167,7 +171,7 @@ static void restore_lpc_64bit(int32_t* 
// 24-bit stereo packer for 4-byte-aligned output buffers.
//
// block_samples holds block_size left-channel samples followed by block_size
// right-channel samples. The low 24 bits of every sample are written
// interleaved (L, R, L, R, ...) as 3 bytes each, least-significant byte first.
//
// Two stereo pairs (12 bytes) are emitted as three uint32_t stores per loop
// iteration instead of twelve byte stores. The word composition below produces
// this byte order in memory only on a little-endian target:
//
//   L0[0] L0[1] L0[2] R0[0]   R0[1] R0[2] L1[0] L1[1]   L1[2] R1[0] R1[1] R1[2]
//   \------- word0 -------/   \------- word1 -------/   \------- word2 -------/
//
// Preconditions (not checked here; the caller is expected to have verified
// them): output_buffer is 4-byte aligned and has room for block_size * 6
// bytes, and block_samples holds 2 * block_size entries.
FLAC_OPTIMIZE_O3
static void write_samples_24bit_2ch_aligned(uint8_t* output_buffer, const int32_t* block_samples,
                                            uint32_t block_size) {
  uint32_t* out32 = reinterpret_cast<uint32_t*>(output_buffer);
  const int32_t* left = block_samples;
  const int32_t* right = block_samples + block_size;

  uint32_t i = 0;
  const uint32_t unroll_limit = block_size & ~1U;  // largest even count <= block_size

  for (; i < unroll_limit; i += 2) {
    // Work on unsigned copies so every shift below is well-defined.
    const uint32_t l0 = static_cast<uint32_t>(left[i]);
    const uint32_t r0 = static_cast<uint32_t>(right[i]);
    const uint32_t l1 = static_cast<uint32_t>(left[i + 1]);
    const uint32_t r1 = static_cast<uint32_t>(right[i + 1]);

    out32[0] = (l0 & 0xFFFFFFU) | (r0 << 24);       // NOLINT(readability-magic-numbers)
    out32[1] = ((r0 >> 8) & 0xFFFFU) | (l1 << 16);  // NOLINT(readability-magic-numbers)
    out32[2] = ((l1 >> 16) & 0xFFU) | (r1 << 8);    // NOLINT(readability-magic-numbers)
    out32 += 3;
  }

  // Odd-count tail: one stereo sample (6 bytes) remains. Fall back to byte
  // stores for the final pair to keep the 2-sample fast path simple.
  if (i < block_size) {
    uint8_t* tail = reinterpret_cast<uint8_t*>(out32);
    const int32_t sample_l = left[i];
    const int32_t sample_r = right[i];
    tail[0] = static_cast<uint8_t>(sample_l & 0xFF);
    tail[1] = static_cast<uint8_t>((sample_l >> 8) & 0xFF);
    tail[2] = static_cast<uint8_t>((sample_l >> 16) & 0xFF);
    tail[3] = static_cast<uint8_t>(sample_r & 0xFF);
    tail[4] = static_cast<uint8_t>((sample_r >> 8) & 0xFF);
    tail[5] = static_cast<uint8_t>((sample_r >> 16) & 0xFF);
  }
}