From e497a5137310ccabd3fbdce592db295314202df9 Mon Sep 17 00:00:00 2001
From: Ivan Savenko <saven.ivan@gmail.com>
Date: Tue, 1 Oct 2024 15:25:58 +0000
Subject: [PATCH] Use swresample (part of FFmpeg) to de-planarize audio into
 format that can be consumed by SDL

---
 CMakeLists.txt                 |  6 +--
 client/media/CVideoHandler.cpp | 77 ++++++++++++++++++++++++++--------
 2 files changed, 60 insertions(+), 23 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a9918c0a3..31bd3b8f1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -486,11 +486,7 @@ if(NOT FORCE_BUNDLED_MINIZIP)
 endif()
 
 if (ENABLE_CLIENT)
-	set(FFMPEG_COMPONENTS avutil swscale avformat avcodec)
-	if(APPLE_IOS AND NOT USING_CONAN)
-		list(APPEND FFMPEG_COMPONENTS swresample)
-	endif()
-	find_package(ffmpeg COMPONENTS ${FFMPEG_COMPONENTS})
+	find_package(ffmpeg COMPONENTS avutil swscale avformat avcodec swresample)
 
 	find_package(SDL2 REQUIRED)
 	find_package(SDL2_image REQUIRED)
diff --git a/client/media/CVideoHandler.cpp b/client/media/CVideoHandler.cpp
index 51e17ff10..9428c7428 100644
--- a/client/media/CVideoHandler.cpp
+++ b/client/media/CVideoHandler.cpp
@@ -33,7 +33,9 @@ extern "C" {
 #include <libavformat/avformat.h>
 #include <libavcodec/avcodec.h>
 #include <libavutil/imgutils.h>
+#include <libavutil/opt.h>
 #include <libswscale/swscale.h>
+#include <libswresample/swresample.h>
 }
 
 // Define a set of functions to read data
@@ -501,32 +503,71 @@ std::pair<std::unique_ptr<ui8 []>, si64> CAudioInstance::extractAudio(const Vide
 	int numChannels = codecpar->ch_layout.nb_channels;
 #endif
 
-	samples.reserve(44100 * 5); // arbitrary 5-second buffer
+	samples.reserve(44100 * 5); // arbitrary 5-second buffer to reduce reallocations
 
-	for (;;)
+	if (formatProperties.isPlanar && numChannels > 1)
 	{
-		decodeNextFrame();
-		const AVFrame * frame = getCurrentFrame();
+		// Format is 'planar', which is not supported by wav / SDL
+		// Use swresample part of ffmpeg to deplanarize audio into format supported by wav / SDL
 
-		if (!frame)
-			break;
+		auto sourceFormat = static_cast<AVSampleFormat>(codecpar->format);
+		auto targetFormat = av_get_alt_sample_fmt(sourceFormat, false);
 
-		int samplesToRead = frame->nb_samples * numChannels;
-		int bytesToRead = samplesToRead * formatProperties.sampleSizeBytes;
+		SwrContext * swr_ctx = swr_alloc();
 
-		if (formatProperties.isPlanar && numChannels > 1)
+#if (LIBAVUTIL_VERSION_MAJOR < 58)
+		av_opt_set_channel_layout(swr_ctx, "in_chlayout", codecpar->channel_layout, 0);
+		av_opt_set_channel_layout(swr_ctx, "out_chlayout", codecpar->channel_layout, 0);
+#else
+		av_opt_set_chlayout(swr_ctx, "in_chlayout", &codecpar->ch_layout, 0);
+		av_opt_set_chlayout(swr_ctx, "out_chlayout", &codecpar->ch_layout, 0);
+#endif
+		av_opt_set_int(swr_ctx, "in_sample_rate", codecpar->sample_rate, 0);
+		av_opt_set_sample_fmt(swr_ctx, "in_sample_fmt", sourceFormat, 0);
+		av_opt_set_int(swr_ctx, "out_sample_rate", codecpar->sample_rate, 0);
+		av_opt_set_sample_fmt(swr_ctx, "out_sample_fmt", targetFormat, 0);
+
+		int initResult = swr_init(swr_ctx);
+		if (initResult < 0)
+			throwFFmpegError(initResult);
+
+		std::vector<uint8_t> frameSamplesBuffer;
+		for (;;)
 		{
-			// Workaround for lack of resampler
-			// Currently, ffmpeg on conan systems is built without sws resampler
-			// Because of that, and because wav format does not supports 'planar' formats from ffmpeg
-			// we need to de-planarize it and convert to "normal" (non-planar / interleaved) stream
-			samples.reserve(samples.size() + bytesToRead);
-			for (int sm = 0; sm < frame->nb_samples; ++sm)
-				for (int ch = 0; ch < numChannels; ++ch)
-					samples.insert(samples.end(), frame->data[ch] + sm * formatProperties.sampleSizeBytes, frame->data[ch] + (sm+1) * formatProperties.sampleSizeBytes );
+			decodeNextFrame();
+			const AVFrame * frame = getCurrentFrame();
+
+			if (!frame)
+				break;
+
+			size_t samplesToRead = frame->nb_samples * numChannels;
+			size_t bytesToRead = samplesToRead * formatProperties.sampleSizeBytes;
+			frameSamplesBuffer.resize(std::max(frameSamplesBuffer.size(), bytesToRead));
+			uint8_t * frameSamplesPtr = frameSamplesBuffer.data();
+
+			int result = swr_convert(swr_ctx, &frameSamplesPtr, frame->nb_samples, (const uint8_t **)frame->data, frame->nb_samples);
+
+			if (result < 0)
+				throwFFmpegError(result);
+
+			size_t samplesToCopy = result * numChannels;
+			size_t bytesToCopy = samplesToCopy * formatProperties.sampleSizeBytes;
+			samples.insert(samples.end(), frameSamplesBuffer.begin(), frameSamplesBuffer.begin() + bytesToCopy);
 		}
-		else
+		swr_free(&swr_ctx);
+	}
+	else
+	{
+		for (;;)
 		{
+			decodeNextFrame();
+			const AVFrame * frame = getCurrentFrame();
+
+			if (!frame)
+				break;
+
+			size_t samplesToRead = frame->nb_samples * numChannels;
+			size_t bytesToRead = samplesToRead * formatProperties.sampleSizeBytes;
 			samples.insert(samples.end(), frame->data[0], frame->data[0] + bytesToRead);
 		}
 	}