|
| 1 | +// Copyright 2008 Dolphin Emulator Project |
| 2 | +// SPDX-License-Identifier: GPL-2.0-or-later |
| 3 | + |
| 4 | +#include "Core/HW/GranularMixer.h" |
| 5 | + |
| 6 | +#include <chrono> |
| 7 | +#include <algorithm> |
| 8 | +#include <cmath> |
| 9 | +#include <cstring> |
| 10 | + |
| 11 | +#include "Common/CommonTypes.h" |
| 12 | +#include "Common/Log.h" |
| 13 | +#include "Common/Math/math_util.h" |
| 14 | +#include "Common/Swap.h" |
| 15 | +#include "Core/HW/Display.h" |
| 16 | +#include "Core/Core.h" |
| 17 | +#include "Core/System.h" |
| 18 | +#include "Core/Util/AudioFormat.h" // for clamp_u16 |
| 19 | + |
// Window applied to every granule before it is queued (see Enqueue()).
//
// The curve is bell-shaped ("something like a gaussian") and symmetric:
// entry i equals entry 255 - i. The rising half (0..127) and the falling half
// (128..255) are complementary, i.e. window[i] + window[i + 128] == 1 to the
// printed precision, so two granules overlapped by 50% add up to unity gain.
static const float g_GranuleWindow[256] = {
	// Rising half, indices 0..127.
	0.0000016272f, 0.0000050749f, 0.0000113187f, 0.0000216492f, 0.0000377350f, 0.0000616906f, 0.0000961509f, 0.0001443499f,
	0.0002102045f, 0.0002984010f, 0.0004144844f, 0.0005649486f, 0.0007573262f, 0.0010002765f, 0.0013036694f, 0.0016786636f,
	0.0021377783f, 0.0026949534f, 0.0033656000f, 0.0041666352f, 0.0051165029f, 0.0062351752f, 0.0075441359f, 0.0090663409f,
	0.0108261579f, 0.0128492811f, 0.0151626215f, 0.0177941726f, 0.0207728499f, 0.0241283062f, 0.0278907219f, 0.0320905724f,
	0.0367583739f, 0.0419244083f, 0.0476184323f, 0.0538693708f, 0.0607049996f, 0.0681516192f, 0.0762337261f, 0.0849736833f,
	0.0943913952f, 0.1045039915f, 0.1153255250f, 0.1268666867f, 0.1391345431f, 0.1521323012f, 0.1658591025f, 0.1803098534f,
	0.1954750915f, 0.2113408944f, 0.2278888303f, 0.2450959552f, 0.2629348550f, 0.2813737361f, 0.3003765625f, 0.3199032396f,
	0.3399098438f, 0.3603488941f, 0.3811696664f, 0.4023185434f, 0.4237393998f, 0.4453740162f, 0.4671625177f, 0.4890438330f,
	0.5109561670f, 0.5328374823f, 0.5546259838f, 0.5762606002f, 0.5976814566f, 0.6188303336f, 0.6396511059f, 0.6600901562f,
	0.6800967604f, 0.6996234375f, 0.7186262639f, 0.7370651450f, 0.7549040448f, 0.7721111697f, 0.7886591056f, 0.8045249085f,
	0.8196901466f, 0.8341408975f, 0.8478676988f, 0.8608654569f, 0.8731333133f, 0.8846744750f, 0.8954960085f, 0.9056086048f,
	0.9150263167f, 0.9237662739f, 0.9318483808f, 0.9392950004f, 0.9461306292f, 0.9523815677f, 0.9580755917f, 0.9632416261f,
	0.9679094276f, 0.9721092781f, 0.9758716938f, 0.9792271501f, 0.9822058274f, 0.9848373785f, 0.9871507189f, 0.9891738421f,
	0.9909336591f, 0.9924558641f, 0.9937648248f, 0.9948834971f, 0.9958333648f, 0.9966344000f, 0.9973050466f, 0.9978622217f,
	0.9983213364f, 0.9986963306f, 0.9989997235f, 0.9992426738f, 0.9994350514f, 0.9995855156f, 0.9997015990f, 0.9997897955f,
	0.9998556501f, 0.9999038491f, 0.9999383094f, 0.9999622650f, 0.9999783508f, 0.9999886813f, 0.9999949251f, 0.9999983728f,
	// Falling half, indices 128..255 (mirror image of the rising half).
	0.9999983728f, 0.9999949251f, 0.9999886813f, 0.9999783508f, 0.9999622650f, 0.9999383094f, 0.9999038491f, 0.9998556501f,
	0.9997897955f, 0.9997015990f, 0.9995855156f, 0.9994350514f, 0.9992426738f, 0.9989997235f, 0.9986963306f, 0.9983213364f,
	0.9978622217f, 0.9973050466f, 0.9966344000f, 0.9958333648f, 0.9948834971f, 0.9937648248f, 0.9924558641f, 0.9909336591f,
	0.9891738421f, 0.9871507189f, 0.9848373785f, 0.9822058274f, 0.9792271501f, 0.9758716938f, 0.9721092781f, 0.9679094276f,
	0.9632416261f, 0.9580755917f, 0.9523815677f, 0.9461306292f, 0.9392950004f, 0.9318483808f, 0.9237662739f, 0.9150263167f,
	0.9056086048f, 0.8954960085f, 0.8846744750f, 0.8731333133f, 0.8608654569f, 0.8478676988f, 0.8341408975f, 0.8196901466f,
	0.8045249085f, 0.7886591056f, 0.7721111697f, 0.7549040448f, 0.7370651450f, 0.7186262639f, 0.6996234375f, 0.6800967604f,
	0.6600901562f, 0.6396511059f, 0.6188303336f, 0.5976814566f, 0.5762606002f, 0.5546259838f, 0.5328374823f, 0.5109561670f,
	0.4890438330f, 0.4671625177f, 0.4453740162f, 0.4237393998f, 0.4023185434f, 0.3811696664f, 0.3603488941f, 0.3399098438f,
	0.3199032396f, 0.3003765625f, 0.2813737361f, 0.2629348550f, 0.2450959552f, 0.2278888303f, 0.2113408944f, 0.1954750915f,
	0.1803098534f, 0.1658591025f, 0.1521323012f, 0.1391345431f, 0.1268666867f, 0.1153255250f, 0.1045039915f, 0.0943913952f,
	0.0849736833f, 0.0762337261f, 0.0681516192f, 0.0607049996f, 0.0538693708f, 0.0476184323f, 0.0419244083f, 0.0367583739f,
	0.0320905724f, 0.0278907219f, 0.0241283062f, 0.0207728499f, 0.0177941726f, 0.0151626215f, 0.0128492811f, 0.0108261579f,
	0.0090663409f, 0.0075441359f, 0.0062351752f, 0.0051165029f, 0.0041666352f, 0.0033656000f, 0.0026949534f, 0.0021377783f,
	0.0016786636f, 0.0013036694f, 0.0010002765f, 0.0007573262f, 0.0005649486f, 0.0004144844f, 0.0002984010f, 0.0002102045f,
	0.0001443499f, 0.0000961509f, 0.0000616906f, 0.0000377350f, 0.0000216492f, 0.0000113187f, 0.0000050749f, 0.0000016272f
};
| 66 | + |
| 67 | +inline s16 clampfloat_s16(float f) { |
| 68 | + if (f <= -32767.0f) return -32767; |
| 69 | + if (f >= 32767.0f) return 32767; |
| 70 | + return (s16)f; |
| 71 | +} |
| 72 | + |
// Constructs the mixer. The body only emits an informational log line;
// no member is assigned here.
GranularMixer::GranularMixer() {
	INFO_LOG(Log::Audio, "Mixer is initialized");
}
| 76 | + |
| 77 | +// Executed from sound stream thread |
| 78 | +void GranularMixer::Mix(s16 *samples, u32 num_samples, int outSampleRate, float fpsEstimate) { |
| 79 | + _dbg_assert_(samples); |
| 80 | + if (!samples) |
| 81 | + return; |
| 82 | + memset(samples, 0, num_samples * 2 * sizeof(s16)); |
| 83 | + frameTimeEstimate_ = 1.0f / fpsEstimate; |
| 84 | + |
| 85 | + smoothedReadSize_ = smoothedReadSize_ == 0 ? num_samples : (smoothedReadSize_ * 0.95f + num_samples * 0.05f); |
| 86 | + |
| 87 | + constexpr u32 INDEX_HALF = 0x80000000; |
| 88 | + constexpr double FADE_IN_RC = 0.008; |
| 89 | + constexpr double FADE_OUT_RC = 0.064; |
| 90 | + |
| 91 | + // We need at least a double because the index jump has 24 bits of fractional precision. |
| 92 | + const double out_sample_rate = outSampleRate; |
| 93 | + double inSampleRate = 44100; |
| 94 | + |
| 95 | + const double emulation_speed = 1.0f; // TODO: Change when we're in slow-motion mode etc. |
| 96 | + if (0 < emulation_speed && emulation_speed != 1.0) |
| 97 | + inSampleRate *= emulation_speed; |
| 98 | + |
| 99 | + const double base = static_cast<double>(1 << GRANULE_FRAC_BITS); |
| 100 | + const u32 index_jump = std::lround(base * inSampleRate / out_sample_rate); |
| 101 | + |
| 102 | + // These fade in / out multiplier are tuned to match a constant |
| 103 | + // fade speed regardless of the input or the output sample rate. |
| 104 | + const float fade_in_mul = -std::expm1(-1.0 / (out_sample_rate * FADE_IN_RC)); |
| 105 | + const float fade_out_mul = -std::expm1(-1.0 / (out_sample_rate * FADE_OUT_RC)); |
| 106 | + |
| 107 | + // Calculate the ideal length of the granule queue. |
| 108 | + // NOTE: We must have enough room here for 20fps games, generating all their audio |
| 109 | + // in a burst each frame (since we can't force real clock sync). That means 16*3 = 48 or rather 50ms. |
| 110 | + // However, in case of faster framerates, we should apply some pressure to reduce this. And if real clock sync |
| 111 | + // is on, we should also be able to get away with a shorter buffer here. |
| 112 | + // const u32 buffer_size_ms = frameTimeEstimate_ * 44100.0f; |
| 113 | + const u32 buffer_size_samples = smoothedReadSize_ * 4 + std::llround(frameTimeEstimate_ * inSampleRate); |
| 114 | + queuedSamplesTarget_ = buffer_size_samples; |
| 115 | + |
| 116 | + // Limit the possible queue sizes to any number between 4 and 64. |
| 117 | + const u32 buffer_size_granules = |
| 118 | + std::clamp((buffer_size_samples) / (GRANULE_SIZE >> 1), static_cast<u32>(4), |
| 119 | + static_cast<u32>(MAX_GRANULE_QUEUE_SIZE)); |
| 120 | + |
| 121 | + if (buffer_size_granules != m_granule_queue_size.load(std::memory_order_relaxed)) { |
| 122 | + INFO_LOG(Log::Audio, "Granule buffer size changed to %d", buffer_size_granules); |
| 123 | + } |
| 124 | + |
| 125 | + m_granule_queue_size.store(buffer_size_granules, std::memory_order_relaxed); |
| 126 | + |
| 127 | + int actualQueueSize = m_queue_head - m_queue_tail; |
| 128 | + if (smoothedQueueSize_ == 0) { |
| 129 | + smoothedQueueSize_ = actualQueueSize; |
| 130 | + } else { |
| 131 | + constexpr float factor = 0.95f; |
| 132 | + smoothedQueueSize_ = factor * smoothedQueueSize_ + (1.0f - factor) * (float)actualQueueSize; |
| 133 | + } |
| 134 | + if (actualQueueSize < queuedGranulesMin_) { |
| 135 | + queuedGranulesMin_ = actualQueueSize; |
| 136 | + } |
| 137 | + if (actualQueueSize > queuedGranulesMax_) { |
| 138 | + queuedGranulesMax_ = actualQueueSize; |
| 139 | + } |
| 140 | + |
| 141 | + // TODO: The performance of this could be greatly enhanced with SIMD but it won't be easy |
| 142 | + // due to wrapping of various buffers. |
| 143 | + bool queue_looping = m_queue_looping.load(std::memory_order_relaxed); |
| 144 | + while (num_samples-- > 0) { |
| 145 | + // The indexes for the front and back buffers are offset by 50% of the granule size. |
| 146 | + // We use the modular nature of 32-bit integers to wrap around the granule size. |
| 147 | + m_current_index += index_jump; |
| 148 | + const u32 front_index = m_current_index; |
| 149 | + const u32 back_index = m_current_index + INDEX_HALF; |
| 150 | + |
| 151 | + // If either index is less than the index jump, that means we reached |
| 152 | + // the end of the of the buffer and need to load the next granule. |
| 153 | + if (front_index < index_jump) |
| 154 | + Dequeue(&m_front); |
| 155 | + else if (back_index < index_jump) |
| 156 | + Dequeue(&m_back); |
| 157 | + |
| 158 | + // The Granules are pre-windowed, so we can just add them together. A bit of accidental wrapping doesn't matter |
| 159 | + // either since the tails are so weak. |
| 160 | + const u32 ft = front_index >> GRANULE_FRAC_BITS; |
| 161 | + const u32 bt = back_index >> GRANULE_FRAC_BITS; |
| 162 | + const StereoPair s0 = m_front[(ft - 2) & GRANULE_MASK] + m_back[(bt - 2) & GRANULE_MASK]; |
| 163 | + const StereoPair s1 = m_front[(ft - 1) & GRANULE_MASK] + m_back[(bt - 1) & GRANULE_MASK]; |
| 164 | + const StereoPair s2 = m_front[(ft + 0) & GRANULE_MASK] + m_back[(bt + 0) & GRANULE_MASK]; |
| 165 | + const StereoPair s3 = m_front[(ft + 1) & GRANULE_MASK] + m_back[(bt + 1) & GRANULE_MASK]; |
| 166 | + const StereoPair s4 = m_front[(ft + 2) & GRANULE_MASK] + m_back[(bt + 2) & GRANULE_MASK]; |
| 167 | + const StereoPair s5 = m_front[(ft + 3) & GRANULE_MASK] + m_back[(bt + 3) & GRANULE_MASK]; |
| 168 | + |
| 169 | + // Probably an overkill interpolator, but let's go with it for now. |
| 170 | + // Polynomial Interpolators for High-Quality Resampling of |
| 171 | + // Over Sampled Audio by Olli Niemitalo, October 2001. |
| 172 | + // Page 43 -- 6-point, 3rd-order Hermite: |
| 173 | + // https://yehar.com/blog/wp-content/uploads/2009/08/deip.pdf |
| 174 | + const u32 t_frac = m_current_index & ((1 << GRANULE_FRAC_BITS) - 1); |
| 175 | + const float t1 = t_frac / static_cast<float>(1 << GRANULE_FRAC_BITS); |
| 176 | + const float t2 = t1 * t1; |
| 177 | + const float t3 = t2 * t1; |
| 178 | + StereoPair sample = ( |
| 179 | + s0 * ((+0.0f + 1.0f * t1 - 2.0f * t2 + 1.0f * t3) * (1.0f / 12.0f)) + |
| 180 | + s1 * ((+0.0f - 8.0f * t1 + 15.0f * t2 - 7.0f * t3) * (1.0f / 12.0f)) + |
| 181 | + s2 * ((+3.0f + 0.0f * t1 - 7.0f * t2 + 4.0f * t3) * (1.0f / 3.0f)) + |
| 182 | + s3 * ((+0.0f + 2.0f * t1 + 5.0f * t2 - 4.0f * t3) * (1.0f / 3.0f)) + |
| 183 | + s4 * ((+0.0f - 1.0f * t1 - 6.0f * t2 + 7.0f * t3) * (1.0f / 12.0f)) + |
| 184 | + s5 * ((+0.0f + 0.0f * t1 + 1.0f * t2 - 1.0f * t3) * (1.0f / 12.0f)) |
| 185 | + ); |
| 186 | + |
| 187 | + // Update the looping flag occasionally. |
| 188 | + if (!(num_samples & 31)) { |
| 189 | + queue_looping = m_queue_looping.load(std::memory_order_relaxed); |
| 190 | + } |
| 191 | + |
| 192 | + // Apply Fade In / Fade Out depending on if we are looping |
| 193 | + if (queue_looping) |
| 194 | + m_fade_volume += fade_out_mul * (0.0f - m_fade_volume); |
| 195 | + else |
| 196 | + m_fade_volume += fade_in_mul * (1.0f - m_fade_volume); |
| 197 | + |
| 198 | + samples[0] = (int16_t)clamp_value(sample.l * m_fade_volume, -32767.0f, 32767.0f); |
| 199 | + samples[1] = (int16_t)clamp_value(sample.r * m_fade_volume, -32767.0f, 32767.0f); |
| 200 | + |
| 201 | + samples += 2; |
| 202 | + } |
| 203 | +} |
| 204 | + |
| 205 | +void GranularMixer::PushSamples(const s32 *samples, u32 num_samples, float volume) { |
| 206 | + // TODO: This can be massively sped up. Although hardly likely to be a bottleneck. |
| 207 | + while (num_samples-- > 0) { |
| 208 | + const s16 l = clampfloat_s16(samples[0] * volume); |
| 209 | + const s16 r = clampfloat_s16(samples[1] * volume); |
| 210 | + samples += 2; |
| 211 | + |
| 212 | + m_next_buffer[m_next_buffer_index] = StereoPair(l, r); |
| 213 | + m_next_buffer_index = (m_next_buffer_index + 1) & GRANULE_MASK; |
| 214 | + |
| 215 | + // The granules overlap by 50%, so we need to enqueue the |
| 216 | + // next buffer every time we fill half of the samples. |
| 217 | + if (m_next_buffer_index == 0 || m_next_buffer_index == m_next_buffer.size() / 2) { |
| 218 | + Enqueue(); |
| 219 | + } |
| 220 | + } |
| 221 | +} |
| 222 | + |
| 223 | +void GranularMixer::Enqueue() { |
| 224 | + const u32 head = m_queue_head.load(std::memory_order_acquire); |
| 225 | + |
| 226 | + // Check if we run out of space in the circular queue. (rare) |
| 227 | + u32 next_head = head + 1; |
| 228 | + if ((next_head & GRANULE_QUEUE_MASK) == (m_queue_tail.load(std::memory_order_acquire) & GRANULE_QUEUE_MASK)) { |
| 229 | + WARN_LOG(Log::Audio, |
| 230 | + "Granule Queue has completely filled and audio samples are being dropped. " |
| 231 | + "This should not happen unless the audio backend has stopped requesting audio."); |
| 232 | + return; |
| 233 | + } |
| 234 | + |
| 235 | + // The compiler (at least MSVC) fails at optimizing this loop using SIMD instructions. |
| 236 | + const u32 start_index = m_next_buffer_index; |
| 237 | + |
| 238 | + const u32 maskedHead = head & GRANULE_QUEUE_MASK; |
| 239 | + for (u32 i = 0; i < GRANULE_SIZE; ++i) { |
| 240 | + m_queue[maskedHead][i] = m_next_buffer[(i + start_index) & GRANULE_MASK] * g_GranuleWindow[i]; |
| 241 | + } |
| 242 | + |
| 243 | + m_queue_head.store(next_head, std::memory_order_release); |
| 244 | + m_queue_looping.store(false, std::memory_order_relaxed); |
| 245 | +} |
| 246 | + |
| 247 | +void GranularMixer::Dequeue(Granule *granule) { |
| 248 | + const u32 granule_queue_size = m_granule_queue_size.load(std::memory_order_relaxed); |
| 249 | + const u32 head = m_queue_head.load(std::memory_order_acquire); |
| 250 | + u32 tail = m_queue_tail.load(std::memory_order_acquire); |
| 251 | + |
| 252 | + // Checks to see if the queue has gotten too long. |
| 253 | + if ((head - tail) > granule_queue_size) { |
| 254 | + // Jump the playhead to half the queue size behind the head. |
| 255 | + const u32 gap = (granule_queue_size >> 1) + 1; |
| 256 | + tail = (head - gap); |
| 257 | + overruns_++; |
| 258 | + } |
| 259 | + |
| 260 | + // Checks to see if the queue is empty. |
| 261 | + u32 next_tail = tail + 1; |
| 262 | + |
| 263 | + bool looping = m_queue_looping.load(); |
| 264 | + |
| 265 | + /*if (!looping && !smoothedQueueSize_ < granule_queue_size / 2) { |
| 266 | + // Repeat a single block occasionally to make sure we have a reasonably sized queue. |
| 267 | + next_tail = tail; |
| 268 | + } else*/ if (next_tail == head) { |
| 269 | + // Only fill gaps when running to prevent stutter on pause. |
| 270 | + CoreState state = coreState; |
| 271 | + const bool is_running = state == CORE_RUNNING_CPU || state == CORE_RUNNING_GE; |
| 272 | + if (g_Config.bFillAudioGaps && is_running) { |
| 273 | + // Jump the playhead to half the queue size behind the head. |
| 274 | + // This will repeat a few past granules I guess? They still contain sensible data. |
| 275 | + // This provides smoother audio playback than suddenly stopping. |
| 276 | + const u32 gap = std::max<u32>(2, granule_queue_size >> 1) - 1; |
| 277 | + next_tail = head - gap; |
| 278 | + underruns_++; |
| 279 | + m_queue_looping.store(true, std::memory_order_relaxed); |
| 280 | + } else { |
| 281 | + // Send a zero granule. |
| 282 | + std::fill(granule->begin(), granule->end(), StereoPair{ 0.0f, 0.0f }); |
| 283 | + m_queue_looping.store(false, std::memory_order_relaxed); |
| 284 | + return; |
| 285 | + } |
| 286 | + } |
| 287 | + |
| 288 | + *granule = m_queue[tail & GRANULE_QUEUE_MASK]; |
| 289 | + m_queue_tail.store(next_tail, std::memory_order_release); |
| 290 | +} |
| 291 | + |
| 292 | +void GranularMixer::GetStats(GranularStats *stats) { |
| 293 | + stats->queuedGranulesMin = queuedGranulesMin_; |
| 294 | + stats->queuedGranulesMax = queuedGranulesMax_; |
| 295 | + stats->smoothedQueuedGranules = smoothedQueueSize_; |
| 296 | + stats->targetQueueSize = m_granule_queue_size.load(std::memory_order_relaxed); |
| 297 | + stats->maxQueuedGranules = MAX_GRANULE_QUEUE_SIZE; |
| 298 | + stats->fadeVolume = m_fade_volume; |
| 299 | + stats->looping = m_queue_looping; |
| 300 | + stats->overruns = overruns_; |
| 301 | + stats->underruns = underruns_; |
| 302 | + stats->smoothedReadSize = smoothedReadSize_; |
| 303 | + stats->frameTimeEstimate = frameTimeEstimate_; |
| 304 | + stats->queuedSamplesTarget = queuedSamplesTarget_; |
| 305 | + queuedGranulesMin_ = 10000; |
| 306 | + queuedGranulesMax_ = 0; |
| 307 | +} |
0 commit comments