From 3f8781db69cb49475f59808b60e6f3e66ffbca39 Mon Sep 17 00:00:00 2001
From: Ivan Savenko <saven.ivan@gmail.com>
Date: Fri, 3 May 2024 22:22:57 +0300
Subject: [PATCH] Fixed sound extraction, slightly better approach to ffmpeg

---
 client/CMT.cpp                 |  11 --
 client/media/CVideoHandler.cpp | 233 ++++++++++++++++++++++-----------
 client/media/CVideoHandler.h   |  24 ++--
 client/media/IVideoPlayer.h    |   6 +-
 client/widgets/VideoWidget.cpp |   4 +-
 5 files changed, 176 insertions(+), 102 deletions(-)
diff --git a/client/CMT.cpp b/client/CMT.cpp
index 859da2dce..d3aeaed0a 100644
--- a/client/CMT.cpp
+++ b/client/CMT.cpp
@@ -396,20 +396,12 @@ int main(int argc, char * argv[])
 //plays intro, ends when intro is over or button has been pressed (handles events)
 void playIntro()
 {
-	auto audioData = CCS->videoh->getAudio(VideoPath::builtin("3DOLOGO.SMK"));
-	int sound = CCS->soundh->playSound(audioData);
-	if(CCS->videoh->playIntroVideo(VideoPath::builtin("3DOLOGO.SMK")))
-	{
-		audioData = CCS->videoh->getAudio(VideoPath::builtin("NWCLOGO.SMK"));
-		sound = CCS->soundh->playSound(audioData);
-		if (CCS->videoh->playIntroVideo(VideoPath::builtin("NWCLOGO.SMK")))
-		{
-			audioData = CCS->videoh->getAudio(VideoPath::builtin("H3INTRO.SMK"));
-			sound = CCS->soundh->playSound(audioData);
-			CCS->videoh->playIntroVideo(VideoPath::builtin("H3INTRO.SMK"));
-		}
-	}
-	CCS->soundh->stopSound(sound);
+	// Clips are shown back to back; a false result from playIntroVideo ends the sequence early
+	for(const char * clipName : {"3DOLOGO.SMK", "NWCLOGO.SMK", "H3INTRO.SMK"})
+	{
+		if(!CCS->videoh->playIntroVideo(VideoPath::builtin(clipName)))
+			break;
+	}
 }
 
 static void mainLoop()
diff --git a/client/media/CVideoHandler.cpp b/client/media/CVideoHandler.cpp
index 7ea3c0a2c..ec664ae5b 100644
--- a/client/media/CVideoHandler.cpp
+++ b/client/media/CVideoHandler.cpp
@@ -12,6 +12,9 @@
 
 #ifndef DISABLE_VIDEO
 
+#include "ISoundPlayer.h"
+
+#include "../CGameInfo.h"
 #include "../CMT.h"
 #include "../CPlayerInterface.h"
 #include "../eventsSDL/InputHandler.h"
@@ -81,12 +84,16 @@ static std::unique_ptr<CInputStream> findVideoData(const VideoPath & videoToOpen
 
 void CVideoInstance::open(const VideoPath & videoToOpen)
 {
-	state.videoData = findVideoData(videoToOpen);
+	input = findVideoData(videoToOpen); // only stores the data stream; ffmpeg contexts are created separately by openContext()
+}
 
+void CVideoInstance::openContext(FFMpegStreamState & state)
+{
 	static const int BUFFER_SIZE = 4096;
+	input->seek(0);
 
 	auto * buffer = static_cast<unsigned char *>(av_malloc(BUFFER_SIZE)); // will be freed by ffmpeg
-	state.context = avio_alloc_context(buffer, BUFFER_SIZE, 0, state.videoData.get(), lodRead, nullptr, lodSeek);
+	state.context = avio_alloc_context(buffer, BUFFER_SIZE, 0, input.get(), lodRead, nullptr, lodSeek);
 
 	state.formatContext = avformat_alloc_context();
 	state.formatContext->pb = state.context;
@@ -101,52 +108,63 @@ void CVideoInstance::open(const VideoPath & videoToOpen)
 
-	if(avfopen < 0)
+	// findStreamInfo is the status reported on failure, so it is the value that has to be tested
+	if(findStreamInfo < 0)
 		throwFFmpegError(findStreamInfo);
-
-	for(int i = 0; i < state.formatContext->nb_streams; i++)
-	{
-		if(state.formatContext->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO && video.streamIndex == -1)
-		{
-			openStream(video, i);
-		}
-
-		if(state.formatContext->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO && audio.streamIndex == -1)
-			openStream(audio, i);
-	}
 }
 
-void CVideoInstance::openStream(FFMpegStreamState & streamState, int streamIndex)
+/// Finds and opens a decoder for stream `streamIndex` of `state.formatContext`.
+/// On success state.codec, state.codecContext and state.streamIndex are filled in.
+/// Failures are reported through std::runtime_error or throwFFmpegError().
+void CVideoInstance::openCodec(FFMpegStreamState & state, int streamIndex)
 {
-	streamState.streamIndex = streamIndex;
-
 	// Find the decoder for the stream
-	streamState.codec = avcodec_find_decoder(state.formatContext->streams[streamIndex]->codecpar->codec_id);
+	state.codec = avcodec_find_decoder(state.formatContext->streams[streamIndex]->codecpar->codec_id);
 
-	if(streamState.codec == nullptr)
+	if(state.codec == nullptr)
 		throw std::runtime_error("Unsupported codec");
 
-	streamState.codecContext = avcodec_alloc_context3(streamState.codec);
-	if(streamState.codecContext == nullptr)
+	state.codecContext = avcodec_alloc_context3(state.codec);
+	if(state.codecContext == nullptr)
 		throw std::runtime_error("Failed to create codec context");
 
-	// Get a pointer to the codec context for the video stream
-	int ret = avcodec_parameters_to_context(streamState.codecContext, state.formatContext->streams[streamIndex]->codecpar);
+	// Copy the stream's codec parameters into the freshly allocated codec context
+	int ret = avcodec_parameters_to_context(state.codecContext, state.formatContext->streams[streamIndex]->codecpar);
 	if(ret < 0)
 	{
 		//We cannot get codec from parameters
-		avcodec_free_context(&streamState.codecContext);
+		avcodec_free_context(&state.codecContext);
 		throwFFmpegError(ret);
 	}
 
 	// Open codec
-	ret = avcodec_open2(streamState.codecContext, streamState.codec, nullptr);
+	ret = avcodec_open2(state.codecContext, state.codec, nullptr);
 	if(ret < 0)
 	{
-		// Could not open codec
-		streamState.codec = nullptr;
+		// Could not open codec: release the context here as well, mirroring the failure path above
+		avcodec_free_context(&state.codecContext);
+		state.codec = nullptr;
 		throwFFmpegError(ret);
 	}
+
+	// Publish the index only once the decoder is usable, so streamIndex != -1 implies an open codec
+	state.streamIndex = streamIndex;
 }
 
+/// Creates the demuxer context for the `video` state and opens a decoder for the first video
+/// stream of the container. Without a video stream, video.streamIndex keeps its previous value.
+void CVideoInstance::openVideo()
+{
+	openContext(video);
+
+	for(unsigned int i = 0; i < video.formatContext->nb_streams; i++)
+	{
+		if(video.formatContext->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
+		{
+			openCodec(video, static_cast<int>(i));
+			return;
+		}
+	}
+}
+
 void CVideoInstance::prepareOutput(bool scaleToScreenSize, bool useTextureOutput)
 {
 	if (video.streamIndex == -1)
@@ -204,7 +214,7 @@ bool CVideoInstance::nextFrame()
 
 	for(;;)
 	{
-		int ret = av_read_frame(state.formatContext, &packet);
+		int ret = av_read_frame(video.formatContext, &packet);
 		if(ret < 0)
 		{
 			if(ret == AVERROR_EOF)
@@ -218,11 +228,11 @@ bool CVideoInstance::nextFrame()
 			// Decode video frame
 			int rc = avcodec_send_packet(video.codecContext, &packet);
 			if(rc < 0)
-				throwFFmpegError(ret);
+				throwFFmpegError(rc);
 
 			rc = avcodec_receive_frame(video.codecContext, output.frame);
 			if(rc < 0)
-				throwFFmpegError(ret);
+				throwFFmpegError(rc);
 
 			uint8_t * data[4] = {};
 			int linesize[4] = {};
@@ -276,22 +286,30 @@ void CVideoInstance::close()
 	SDL_DestroyTexture(output.textureRGB);
 	SDL_FreeSurface(output.surface);
 
-	// state.videoStream.codec???
-	// state.audioStream.codec???
+	closeState(video);
 
-	avcodec_close(video.codecContext);
-	avcodec_free_context(&video.codecContext);
+	// The output bookkeeping belongs to the instance, not to a stream state, so it is reset here
+	output = FFMpegVideoOutput();
+}
 
-	avcodec_close(audio.codecContext);
-	avcodec_free_context(&audio.codecContext);
+/// Releases the ffmpeg objects referenced by `streamState` and resets it to the default
+/// (empty) state. Only the state passed in is touched, so a temporary state can be closed
+/// without disturbing `video` or `output`.
+void CVideoInstance::closeState(FFMpegStreamState & streamState)
+{
+	avcodec_close(streamState.codecContext);
+	avcodec_free_context(&streamState.codecContext);
 
-	avformat_close_input(&state.formatContext);
-	av_free(state.context);
+	avformat_close_input(&streamState.formatContext);
 
-	output = FFMpegVideoOutput();
-	video = FFMpegStreamState();
-	audio = FFMpegStreamState();
-	state = FFMpegFileState();
+	if(streamState.context != nullptr)
+	{
+		// With custom I/O the buffer stays owned by the caller and is not freed by libavformat
+		av_freep(&streamState.context->buffer);
+		av_free(streamState.context);
+	}
+
+	streamState = FFMpegStreamState();
 }
 
 CVideoInstance::~CVideoInstance()
@@ -328,7 +341,7 @@ void CVideoInstance::tick(uint32_t msPassed)
 #	else
 	auto packet_duration = frame->duration;
 #	endif
-	double frameEndTime = (output.frame->pts + packet_duration) * av_q2d(state.formatContext->streams[video.streamIndex]->time_base);
+	double frameEndTime = (output.frame->pts + packet_duration) * av_q2d(video.formatContext->streams[video.streamIndex]->time_base);
 	output.frameTime += msPassed / 1000.0;
 
 	if(output.frameTime >= frameEndTime)
@@ -338,44 +351,140 @@ void CVideoInstance::tick(uint32_t msPassed)
 	}
 }
 
-#	if 0
-
-std::pair<std::unique_ptr<ui8 []>, si64> CVideoPlayer::getAudio(const VideoPath & videoToOpen)
-{
-	std::pair<std::unique_ptr<ui8 []>, si64> dat(std::make_pair(nullptr, 0));
-
-	FFMpegFileState audio;
-	openVideoFile(audio, videoToOpen);
-
-	if (audio.audioStream.streamIndex < 0)
-	{
-		closeVideoFile(audio);
-		return { nullptr, 0};
-	}
-
-	// Open codec
-	AVFrame *frameAudio = av_frame_alloc();
-		
-	AVPacket packet;
-
-	std::vector<ui8> samples;
-
-	while (av_read_frame(audio.formatContext, &packet) >= 0)
-	{
-		if(packet.stream_index == audio.audioStream.streamIndex)
-		{
-			int rc = avcodec_send_packet(audio.audioStream.codecContext, &packet);
-			if (rc >= 0)
-				packet.size = 0;
-			rc = avcodec_receive_frame(audio.audioStream.codecContext, frameAudio);
-			int bytesToRead = (frameAudio->nb_samples * 2 * (audio.formatContext->streams[audio.audioStream.streamIndex]->codecpar->bits_per_coded_sample / 8));
-			if (rc >= 0)
-				for (int s = 0; s < bytesToRead; s += sizeof(ui8))
-				{
-					ui8 value;
-					memcpy(&value, &frameAudio->data[0][s], sizeof(ui8));
-					samples.push_back(value);
-				}
+/// Size in bytes of one sample of one channel for the given AVSampleFormat value.
+/// Throws std::runtime_error for a value that is not handled below.
+static int32_t sampleSizeBytes(int audioFormat)
+{
+	switch (audioFormat)
+	{
+		case AV_SAMPLE_FMT_U8:          ///< unsigned 8 bits
+		case AV_SAMPLE_FMT_U8P:         ///< unsigned 8 bits, planar
+			return 1;
+		case AV_SAMPLE_FMT_S16:         ///< signed 16 bits
+		case AV_SAMPLE_FMT_S16P:        ///< signed 16 bits, planar
+			return 2;
+		case AV_SAMPLE_FMT_S32:         ///< signed 32 bits
+		case AV_SAMPLE_FMT_S32P:        ///< signed 32 bits, planar
+		case AV_SAMPLE_FMT_FLT:         ///< float
+		case AV_SAMPLE_FMT_FLTP:        ///< float, planar
+			return 4;
+		case AV_SAMPLE_FMT_DBL:         ///< double
+		case AV_SAMPLE_FMT_DBLP:        ///< double, planar
+		case AV_SAMPLE_FMT_S64:         ///< signed 64 bits
+		case AV_SAMPLE_FMT_S64P:        ///< signed 64 bits, planar
+			return 8;
+	}
+	throw std::runtime_error("Invalid audio format");
+}
+
+/// WAV format tag matching the given AVSampleFormat value:
+/// 1 (PCM) for integer samples, 3 (IEEE float) for float/double samples.
+/// Throws std::runtime_error for a value that is not handled below.
+static int32_t sampleWavType(int audioFormat)
+{
+	switch (audioFormat)
+	{
+		case AV_SAMPLE_FMT_U8:          ///< unsigned 8 bits
+		case AV_SAMPLE_FMT_U8P:         ///< unsigned 8 bits, planar
+		case AV_SAMPLE_FMT_S16:         ///< signed 16 bits
+		case AV_SAMPLE_FMT_S16P:        ///< signed 16 bits, planar
+		case AV_SAMPLE_FMT_S32:         ///< signed 32 bits
+		case AV_SAMPLE_FMT_S32P:        ///< signed 32 bits, planar
+		case AV_SAMPLE_FMT_S64:         ///< signed 64 bits
+		case AV_SAMPLE_FMT_S64P:        ///< signed 64 bits, planar
+			return 1; // PCM
+
+		case AV_SAMPLE_FMT_FLT:         ///< float
+		case AV_SAMPLE_FMT_FLTP:        ///< float, planar
+		case AV_SAMPLE_FMT_DBL:         ///< double
+		case AV_SAMPLE_FMT_DBLP:        ///< double, planar
+			return 3; // IEEE float
+	}
+	throw std::runtime_error("Invalid audio format");
+}
+
+/// Decodes the whole audio track of the opened file into memory and hands it to the sound player.
+/// A temporary FFMpegStreamState is used for demuxing and decoding the audio stream.
+/// Returns without playing anything if the container has no audio stream.
+/// NOTE(review): openContext() rewinds and reads the shared `input` stream - presumably this has to
+/// run before any video frame is read from the same instance; verify against callers.
+void CVideoInstance::playAudio()
+{
+	FFMpegStreamState audio;
+
+	openContext(audio);
+
+	for(unsigned int i = 0; i < audio.formatContext->nb_streams; i++)
+	{
+		if(audio.formatContext->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
+		{
+			openCodec(audio, static_cast<int>(i));
+			break;
+		}
+	}
+
+	std::pair<std::unique_ptr<ui8 []>, si64> dat(std::make_pair(nullptr, 0));
+
+	if (audio.streamIndex < 0)
+	{
+		// Nothing to play. No codec was opened for this state, so only the demuxer has to be released
+		avformat_close_input(&audio.formatContext);
+		if(audio.context != nullptr)
+			av_freep(&audio.context->buffer);
+		av_free(audio.context);
+		return;
+	}
+
+	const auto * codecpar = audio.formatContext->streams[audio.streamIndex]->codecpar;
+	AVFrame *frameAudio = av_frame_alloc();
+	AVPacket packet;
+
+	std::vector<ui8> samples;
+
+	const int32_t sampleSize = sampleSizeBytes(codecpar->format);
+	const int channelCount = codecpar->channels;
+	const bool isPlanar = av_sample_fmt_is_planar(static_cast<AVSampleFormat>(codecpar->format)) != 0;
+
+	samples.reserve(44100 * 5); // arbitrary 5-second buffer
+
+	// Only audio packets are decoded; video packets read by this demuxer are simply dropped
+	while (av_read_frame(audio.formatContext, &packet) >= 0)
+	{
+		if(packet.stream_index == audio.streamIndex)
+		{
+			int rc = avcodec_send_packet(audio.codecContext, &packet);
+
+			if(rc < 0)
+				throwFFmpegError(rc);
+
+			for (;;)
+			{
+				rc = avcodec_receive_frame(audio.codecContext, frameAudio);
+				if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF)
+					break;
+
+				if(rc < 0)
+					throwFFmpegError(rc);
+
+				if(isPlanar && channelCount > 1)
+				{
+					// Planar layout keeps each channel in its own buffer; WAV expects interleaved samples
+					for(int sample = 0; sample < frameAudio->nb_samples; ++sample)
+					{
+						for(int channel = 0; channel < channelCount; ++channel)
+						{
+							const ui8 * source = frameAudio->extended_data[channel] + sample * sampleSize;
+							samples.insert(samples.end(), source, source + sampleSize);
+						}
+					}
+				}
+				else
+				{
+					// Packed layout (or mono): all samples of the frame are stored in data[0]
+					const int bytesToRead = frameAudio->nb_samples * channelCount * sampleSize;
+					samples.insert(samples.end(), frameAudio->data[0], frameAudio->data[0] + bytesToRead);
+				}
+			}
 		}
 		av_packet_unref(&packet);
 	}
@@ -391,16 +478,21 @@ std::pair<std::unique_ptr<ui8 []>, si64> CVideoPlayer::getAudio(const VideoPath
 		ui32 SamplesPerSec = 22050;
 		ui32 bytesPerSec = 22050 * 2;
 		ui16 blockAlign = 2;
-		ui16 bitsPerSample = 16;
+		ui16 bitsPerSample = 32;
 		ui8 Subchunk2ID[4] = {'d', 'a', 't', 'a'};
 		ui32 Subchunk2Size;
 	} wav_hdr;
 
 	wav_hdr wav;
 	wav.ChunkSize = samples.size() + sizeof(wav_hdr) - 8;
-  	wav.Subchunk2Size = samples.size() + sizeof(wav_hdr) - 44;
-	wav.SamplesPerSec = audio.formatContext->streams[audio.audioStream.streamIndex]->codecpar->sample_rate;
-	wav.bitsPerSample = audio.formatContext->streams[audio.audioStream.streamIndex]->codecpar->bits_per_coded_sample;
+	wav.AudioFormat = sampleWavType(codecpar->format);
+	wav.NumOfChan = codecpar->channels;
+	wav.SamplesPerSec = codecpar->sample_rate;
+	// One block is one sample of every channel; the byte rate is blocks per second times block size
+	wav.blockAlign = codecpar->channels * sampleSize;
+	wav.bytesPerSec = codecpar->sample_rate * codecpar->channels * sampleSize;
+	wav.bitsPerSample = sampleSize * 8;
+	wav.Subchunk2Size = samples.size() + sizeof(wav_hdr) - 44;
 	auto wavPtr = reinterpret_cast<ui8*>(&wav);
 
 	dat = std::make_pair(std::make_unique<ui8[]>(samples.size() + sizeof(wav_hdr)), samples.size() + sizeof(wav_hdr));
@@ -410,18 +500,17 @@ std::pair<std::unique_ptr<ui8 []>, si64> CVideoPlayer::getAudio(const VideoPath
 	if (frameAudio)
 		av_frame_free(&frameAudio);
 
-	closeVideoFile(audio);
-
-	return dat;
+	CCS->soundh->playSound(dat);
+	closeState(audio);
 }
 
-#	endif
-
 bool CVideoPlayer::openAndPlayVideoImpl(const VideoPath & name, const Point & position, bool useOverlay, bool scale, bool stopOnKey)
 {
 	CVideoInstance instance;
 
 	instance.open(name);
+	instance.playAudio();
+	instance.openVideo();
 	instance.prepareOutput(scale, useOverlay);
 
 	auto lastTimePoint = boost::chrono::steady_clock::now();
@@ -460,7 +549,7 @@ bool CVideoPlayer::openAndPlayVideoImpl(const VideoPath & name, const Point & po
 #endif
 
 		// Framerate delay
-		double targetFrameTimeSeconds = packet_duration * av_q2d(instance.state.formatContext->streams[instance.video.streamIndex]->time_base);
+		double targetFrameTimeSeconds = packet_duration * av_q2d(instance.video.formatContext->streams[instance.video.streamIndex]->time_base);
 		auto targetFrameTime = boost::chrono::milliseconds(static_cast<int>(1000 * (targetFrameTimeSeconds)));
 
 		auto timePointAfterPresent = boost::chrono::steady_clock::now();
@@ -489,14 +578,10 @@ std::unique_ptr<IVideoInstance> CVideoPlayer::open(const VideoPath & name, bool
 	auto result = std::make_unique<CVideoInstance>();
 
 	result->open(name);
+	result->openVideo();
 	result->prepareOutput(scaleToScreen, false);
 
 	return result;
 }
 
-std::pair<std::unique_ptr<ui8 []>, si64> CVideoPlayer::getAudio(const VideoPath & videoToOpen)
-{
-	return {nullptr, 0};
-}
-
 #endif
diff --git a/client/media/CVideoHandler.h b/client/media/CVideoHandler.h
index 9a578260c..bfb8e865b 100644
--- a/client/media/CVideoHandler.h
+++ b/client/media/CVideoHandler.h
@@ -29,16 +29,12 @@ VCMI_LIB_NAMESPACE_END
 
 struct FFMpegStreamState
 {
-	int streamIndex = -1;
-	const AVCodec * codec = nullptr;
-	AVCodecContext * codecContext = nullptr;
-};
-
-struct FFMpegFileState
-{
-	std::unique_ptr<CInputStream> videoData;
 	AVIOContext * context = nullptr;
 	AVFormatContext * formatContext = nullptr;
+
+	const AVCodec * codec = nullptr; ///< decoder looked up by CVideoInstance::openCodec()
+	AVCodecContext * codecContext = nullptr; ///< decoder context allocated and opened by CVideoInstance::openCodec()
+	int streamIndex = -1; ///< index of the selected stream inside formatContext; -1 while no stream has been selected
 };
 
 struct FFMpegVideoOutput
@@ -59,16 +55,20 @@ class CVideoInstance final : public IVideoInstance
 {
 	friend class CVideoPlayer;
 
-	FFMpegFileState state;
+	std::unique_ptr<CInputStream> input;
+
 	FFMpegStreamState video;
-	FFMpegStreamState audio;
 	FFMpegVideoOutput output;
 
 	void open(const VideoPath & fname);
-	void openStream(FFMpegStreamState & streamState, int streamIndex);
+	void openContext(FFMpegStreamState & streamState);
+	void openCodec(FFMpegStreamState & streamState, int streamIndex);
+	void openVideo();
 	void prepareOutput(bool scaleToScreenSize, bool useTextureOutput);
+
 	bool nextFrame();
 	void close();
+	void closeState(FFMpegStreamState & streamState);
 
 public:
 	~CVideoInstance();
@@ -78,6 +78,7 @@ public:
 
 	void show(const Point & position, Canvas & canvas) final;
 	void tick(uint32_t msPassed) final;
+	void playAudio() final;
 };
 
 class CVideoPlayer final : public IVideoPlayer
@@ -89,7 +90,6 @@ public:
 	bool playIntroVideo(const VideoPath & name) final;
 	void playSpellbookAnimation(const VideoPath & name, const Point & position) final;
 	std::unique_ptr<IVideoInstance> open(const VideoPath & name, bool scaleToScreen) final;
-	std::pair<std::unique_ptr<ui8[]>, si64> getAudio(const VideoPath & videoToOpen) final;
 };
 
 #endif
diff --git a/client/media/IVideoPlayer.h b/client/media/IVideoPlayer.h
index 4df144f4c..a0c1e6dc7 100644
--- a/client/media/IVideoPlayer.h
+++ b/client/media/IVideoPlayer.h
@@ -32,6 +32,9 @@ public:
 	/// Advances video playback by specified duration
 	virtual void tick(uint32_t msPassed) = 0;
 
+	/// Attempts to start audio playback from video, if any exists
+	virtual void playAudio() = 0;
+
 	virtual ~IVideoInstance() = default;
 };
 
@@ -47,8 +50,5 @@ public:
 	/// Load video from specified path. Returns nullptr on failure
 	virtual std::unique_ptr<IVideoInstance> open(const VideoPath & name, bool scaleToScreen) = 0;
 
-	/// Extracts audio data from provided video in wav format. Return nullptr on failure
-	virtual std::pair<std::unique_ptr<ui8[]>, si64> getAudio(const VideoPath & videoToOpen) = 0;
-
 	virtual ~IVideoPlayer() = default;
 };
diff --git a/client/widgets/VideoWidget.cpp b/client/widgets/VideoWidget.cpp
index 380c4f467..76627be5e 100644
--- a/client/widgets/VideoWidget.cpp
+++ b/client/widgets/VideoWidget.cpp
@@ -44,8 +44,8 @@ void VideoWidget::show(Canvas & to)
 
 void VideoWidget::activate()
 {
-	auto audioData = CCS->videoh->getAudio(current);
-	videoSoundHandle = CCS->soundh->playSound(audioData, -1);
+	if(videoInstance)
+		videoInstance->playAudio();
 
 	if(videoSoundHandle != -1)
 	{