1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-11-23 21:54:53 +02:00
Files
FFmpeg/libavformat/iamf_parse.c

1226 lines
43 KiB
C
Raw Normal View History

/*
* Immersive Audio Model and Formats parsing
* Copyright (c) 2023 James Almer <jamrial@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/avassert.h"
#include "libavutil/iamf.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/log.h"
#include "libavutil/mem.h"
#include "libavcodec/get_bits.h"
#include "libavcodec/flac.h"
#include "libavcodec/leb.h"
#include "libavcodec/mpeg4audio.h"
#include "libavcodec/put_bits.h"
#include "avio_internal.h"
#include "iamf_parse.h"
#include "isom.h"
/**
 * Read the Opus decoder configuration of a Codec Config OBU.
 *
 * All bytes remaining in the OBU payload are stored in
 * codec_config->extradata behind an eight byte "OpusHead" prefix, and the
 * sample rate is set to 48000.
 *
 * @param codec_config entry to fill in; its audio_roll_distance must be
 *                     negative, otherwise the OBU is rejected
 * @param pb           reader positioned at the decoder configuration
 * @param len          total payload size in bytes backing pb
 * @return 0 on success, a negative AVERROR code otherwise. On a read
 *         failure the allocated extradata stays attached to codec_config.
 */
static int opus_decoder_config(IAMFCodecConfig *codec_config,
                               AVIOContext *pb, int len)
{
    const int payload_size = len - avio_tell(pb);
    int err;

    if (codec_config->audio_roll_distance >= 0 || payload_size < 11)
        return AVERROR_INVALIDDATA;

    codec_config->extradata = av_malloc(payload_size + 8);
    if (codec_config->extradata == NULL)
        return AVERROR(ENOMEM);

    // Magic prefix, followed by the payload as found in the OBU.
    AV_WB32A(codec_config->extradata,     MKBETAG('O','p','u','s'));
    AV_WB32A(codec_config->extradata + 4, MKBETAG('H','e','a','d'));

    err = ffio_read_size(pb, codec_config->extradata + 8, payload_size);
    if (err < 0)
        return err;

    codec_config->sample_rate    = 48000;
    codec_config->extradata_size = payload_size + 8;

    return 0;
}
/**
 * Read the AAC decoder configuration of a Codec Config OBU.
 *
 * Walks the MP4 DecoderConfigDescriptor, validates its fixed fields, stores
 * the DecoderSpecificInfo payload as (zero padded) extradata and derives
 * the sample rate from it.
 *
 * @param codec_config entry to fill in; its audio_roll_distance must be
 *                     negative, otherwise the OBU is rejected
 * @param pb           reader positioned at the decoder configuration
 * @param len          total payload size in bytes backing pb
 * @param logctx       logging context handed to the helper parsers
 * @return 0 on success, a negative AVERROR code otherwise. On failure after
 *         allocation the extradata stays attached to codec_config.
 */
static int aac_decoder_config(IAMFCodecConfig *codec_config,
                              AVIOContext *pb, int len, void *logctx)
{
    MPEG4AudioConfig asc = { 0 };
    int descr_tag, descr_len;
    int obj_type, stream_type, mapped_id;
    int err;

    if (codec_config->audio_roll_distance >= 0)
        return AVERROR_INVALIDDATA;

    ff_mp4_read_descr(logctx, pb, &descr_tag);
    if (descr_tag != MP4DecConfigDescrTag)
        return AVERROR_INVALIDDATA;

    obj_type = avio_r8(pb);
    if (obj_type != 0x40)
        return AVERROR_INVALIDDATA;

    // Upper six bits must equal 5 and the following bit must be clear.
    stream_type = avio_r8(pb);
    if ((stream_type >> 2) != 5 || (stream_type & 0x02))
        return AVERROR_INVALIDDATA;

    avio_skip(pb, 3); // buffer size db
    avio_skip(pb, 4); // rc_max_rate
    avio_skip(pb, 4); // avg bitrate

    mapped_id = ff_codec_get_id(ff_mp4_obj_type, obj_type);
    if (mapped_id && mapped_id != codec_config->codec_id)
        return AVERROR_INVALIDDATA;

    descr_len = ff_mp4_read_descr(logctx, pb, &descr_tag);
    if (descr_tag != MP4DecSpecificDescrTag)
        return AVERROR_INVALIDDATA;
    if (descr_len == 0 || descr_len > (len - avio_tell(pb)))
        return AVERROR_INVALIDDATA;

    // We pad extradata here because avpriv_mpeg4audio_get_config2() needs it.
    codec_config->extradata = av_malloc((size_t)descr_len + AV_INPUT_BUFFER_PADDING_SIZE);
    if (codec_config->extradata == NULL)
        return AVERROR(ENOMEM);

    err = ffio_read_size(pb, codec_config->extradata, descr_len);
    if (err < 0)
        return err;

    memset(codec_config->extradata + descr_len, 0,
           AV_INPUT_BUFFER_PADDING_SIZE);
    codec_config->extradata_size = descr_len;

    err = avpriv_mpeg4audio_get_config2(&asc, codec_config->extradata,
                                        codec_config->extradata_size, 1, logctx);
    if (err < 0)
        return err;

    codec_config->sample_rate = asc.sample_rate;

    return 0;
}
/**
 * Read the FLAC decoder configuration of a Codec Config OBU.
 *
 * Skips the four byte METADATA_BLOCK_HEADER and stores everything that
 * follows as extradata. The sample rate is taken from the upper 20 bits of
 * the 24-bit big-endian value at extradata offset 10.
 *
 * @param codec_config entry to fill in; its audio_roll_distance must be 0
 * @param pb           reader positioned at the decoder configuration
 * @param len          total payload size in bytes backing pb
 * @return 0 on success, a negative AVERROR code otherwise. On a read
 *         failure the allocated extradata stays attached to codec_config.
 */
static int flac_decoder_config(IAMFCodecConfig *codec_config,
                               AVIOContext *pb, int len)
{
    int streaminfo_size, err;

    if (codec_config->audio_roll_distance != 0)
        return AVERROR_INVALIDDATA;

    avio_skip(pb, 4); // METADATA_BLOCK_HEADER

    streaminfo_size = len - avio_tell(pb);
    if (streaminfo_size < FLAC_STREAMINFO_SIZE)
        return AVERROR_INVALIDDATA;

    codec_config->extradata = av_malloc(streaminfo_size);
    if (codec_config->extradata == NULL)
        return AVERROR(ENOMEM);

    err = ffio_read_size(pb, codec_config->extradata, streaminfo_size);
    if (err < 0)
        return err;

    codec_config->extradata_size = streaminfo_size;
    codec_config->sample_rate    = AV_RB24(codec_config->extradata + 10) >> 4;

    return 0;
}
/**
 * Read the LPCM decoder configuration of a Codec Config OBU.
 *
 * Selects the PCM codec id from the endianness flag and the sample size,
 * then reads the 32-bit sample rate. The configuration must end exactly at
 * the end of the OBU payload.
 *
 * Only sample sizes of exactly 16, 24 or 32 bits are accepted. Other values
 * are rejected rather than being rounded down to the next supported size,
 * which would make the samples decode with the wrong width.
 *
 * @param codec_config entry to fill in; its audio_roll_distance must be 0
 * @param pb           reader positioned at the decoder configuration
 * @param len          total payload size in bytes backing pb
 * @return 0 on success, AVERROR_INVALIDDATA on any invalid field
 */
static int ipcm_decoder_config(IAMFCodecConfig *codec_config,
                               AVIOContext *pb, int len)
{
    static const enum AVCodecID sample_fmt[2][3] = {
        { AV_CODEC_ID_PCM_S16BE, AV_CODEC_ID_PCM_S24BE, AV_CODEC_ID_PCM_S32BE },
        { AV_CODEC_ID_PCM_S16LE, AV_CODEC_ID_PCM_S24LE, AV_CODEC_ID_PCM_S32LE },
    };
    int sample_format = avio_r8(pb); // 0 = BE, 1 = LE
    int sample_bits   = avio_r8(pb); // 16, 24, 32
    int size_idx;

    if (sample_format > 1 || codec_config->audio_roll_distance)
        return AVERROR_INVALIDDATA;

    switch (sample_bits) {
    case 16:
        size_idx = 0;
        break;
    case 24:
        size_idx = 1;
        break;
    case 32:
        size_idx = 2;
        break;
    default:
        return AVERROR_INVALIDDATA;
    }

    codec_config->codec_id    = sample_fmt[sample_format][size_idx];
    codec_config->sample_rate = avio_rb32(pb);

    // Trailing or missing bytes mean the configuration is malformed.
    if (len - avio_tell(pb))
        return AVERROR_INVALIDDATA;

    return 0;
}
/**
 * Parse one Codec Config OBU and append it to c->codec_configs.
 *
 * The whole OBU payload is first read into a temporary buffer and then
 * parsed from memory. Unknown codec tags are stored with
 * AV_CODEC_ID_NONE and without any decoder configuration.
 *
 * @param s   logging context
 * @param c   IAMF context receiving the new codec config
 * @param pb  reader positioned at the OBU payload
 * @param len payload size in bytes
 * @return 0 on success, a negative AVERROR code otherwise (including when
 *         the codec_config_id is already known)
 */
static int codec_config_obu(void *s, IAMFContext *c, AVIOContext *pb, int len)
{
    IAMFCodecConfig **grown, *cfg = NULL;
    FFIOContext mem_ctx;
    AVIOContext *in;
    uint8_t *payload;
    enum AVCodecID mapped_id = AV_CODEC_ID_NONE;
    unsigned cfg_id, fourcc, frame_samples;
    int16_t roll_distance;
    int ret;

    payload = av_malloc(len);
    if (!payload)
        return AVERROR(ENOMEM);

    ret = ffio_read_size(pb, payload, len);
    if (ret < 0)
        goto done;

    ffio_init_context(&mem_ctx, payload, len, 0, NULL, NULL, NULL, NULL);
    in = &mem_ctx.pub;

    cfg_id        = ffio_read_leb(in);
    fourcc        = avio_rb32(in);
    frame_samples = ffio_read_leb(in);
    roll_distance = avio_rb16(in);

    // The LPCM codec id is only known once its decoder config is parsed.
    if (fourcc == MKBETAG('O','p','u','s'))
        mapped_id = AV_CODEC_ID_OPUS;
    else if (fourcc == MKBETAG('m','p','4','a'))
        mapped_id = AV_CODEC_ID_AAC;
    else if (fourcc == MKBETAG('f','L','a','C'))
        mapped_id = AV_CODEC_ID_FLAC;

    // Each codec_config_id may only be defined once.
    for (int i = 0; i < c->nb_codec_configs; i++) {
        if (c->codec_configs[i]->codec_config_id != cfg_id)
            continue;
        ret = AVERROR_INVALIDDATA;
        goto done;
    }

    grown = av_realloc_array(c->codec_configs, c->nb_codec_configs + 1, sizeof(*c->codec_configs));
    if (!grown) {
        ret = AVERROR(ENOMEM);
        goto done;
    }
    c->codec_configs = grown;

    cfg = av_mallocz(sizeof(*cfg));
    if (!cfg) {
        ret = AVERROR(ENOMEM);
        goto done;
    }

    cfg->codec_config_id     = cfg_id;
    cfg->codec_id            = mapped_id;
    cfg->nb_samples          = frame_samples;
    cfg->audio_roll_distance = roll_distance;

    switch (fourcc) {
    case MKBETAG('O','p','u','s'):
        ret = opus_decoder_config(cfg, in, len);
        break;
    case MKBETAG('m','p','4','a'):
        ret = aac_decoder_config(cfg, in, len, s);
        break;
    case MKBETAG('f','L','a','C'):
        ret = flac_decoder_config(cfg, in, len);
        break;
    case MKBETAG('i','p','c','m'):
        ret = ipcm_decoder_config(cfg, in, len);
        break;
    default:
        break;
    }
    if (ret < 0)
        goto done;

    // Reject frame sizes and roll distances whose product would not fit an int.
    if ((cfg->nb_samples > INT_MAX) || cfg->nb_samples <= 0 ||
        (-cfg->audio_roll_distance > INT_MAX / cfg->nb_samples)) {
        ret = AVERROR_INVALIDDATA;
        goto done;
    }

    c->codec_configs[c->nb_codec_configs++] = cfg;

    len -= avio_tell(in);
    if (len)
        av_log(s, AV_LOG_WARNING, "Underread in codec_config_obu. %d bytes left at the end\n", len);

    ret = 0;
done:
    av_free(payload);
    if (ret < 0) {
        // The entry was never stored in the context, so drop it here.
        if (cfg)
            av_free(cfg->extradata);
        av_free(cfg);
    }
    return ret;
}
/**
 * Patch the channel information inside a substream's codec extradata so it
 * matches codecpar->ch_layout.nb_channels.
 *
 * - Opus: stores the channel count at byte offset 9 and byte-swaps the
 *   fields at offsets 10, 12 and 16 from big to little endian.
 * - AAC:  re-emits the leading bits of the config (at most 6 bytes) with
 *   the 4-bit channel field replaced.
 * - FLAC: re-emits the leading 13 bytes with the 3-bit channel field
 *   replaced by nb_channels - 1.
 * - Any other codec id: nothing is changed.
 *
 * @param codecpar codec parameters whose extradata is modified in place
 * @return 0 on success, a negative AVERROR code on failure
 */
static int update_extradata(AVCodecParameters *codecpar)
{
GetBitContext gb;
PutBitContext pb;
int ret;
switch(codecpar->codec_id) {
case AV_CODEC_ID_OPUS:
// NOTE(review): no extradata_size check here; relies on the producer having
// stored at least 18 bytes — confirm for all callers.
AV_WB8(codecpar->extradata + 9, codecpar->ch_layout.nb_channels);
AV_WL16A(codecpar->extradata + 10, AV_RB16A(codecpar->extradata + 10)); // Byte swap pre-skip
AV_WL32A(codecpar->extradata + 12, AV_RB32A(codecpar->extradata + 12)); // Byte swap sample rate
AV_WL16A(codecpar->extradata + 16, AV_RB16A(codecpar->extradata + 16)); // Byte swap Output Gain
break;
case AV_CODEC_ID_AAC: {
// Scratch output; only the first sizeof(buf) bytes of extradata are rewritten.
uint8_t buf[6];
int size = FFMIN(codecpar->extradata_size, sizeof(buf));
init_put_bits(&pb, buf, sizeof(buf));
ret = init_get_bits8(&gb, codecpar->extradata, size);
if (ret < 0)
return ret;
// Copy the 5-bit object type, plus its 6-bit extension when escaped.
ret = get_bits(&gb, 5);
put_bits(&pb, 5, ret);
if (ret == AOT_ESCAPE) // violates section 3.11.2, but better check for it
put_bits(&pb, 6, get_bits(&gb, 6));
// Copy the next 4-bit field; the value 0x0f is followed by 24 more bits.
// (presumably the sampling frequency index / explicit rate — confirm)
ret = get_bits(&gb, 4);
put_bits(&pb, 4, ret);
if (ret == 0x0f)
put_bits(&pb, 24, get_bits(&gb, 24));
// Drop the stored 4-bit channel field and write the new one instead.
skip_bits(&gb, 4);
put_bits(&pb, 4, codecpar->ch_layout.nb_channels); // set channel config
// A negative count means the reads above ran past the input.
ret = get_bits_left(&gb);
if (ret < 0)
return AVERROR_INVALIDDATA;
// Copy whatever input bits remain, limited to the space left in buf.
ret = FFMIN(ret, put_bits_left(&pb));
while (ret >= 32) {
put_bits32(&pb, get_bits_long(&gb, 32));
ret -= 32;
}
put_bits(&pb, ret, get_bits_long(&gb, ret));
flush_put_bits(&pb);
memcpy(codecpar->extradata, buf, put_bytes_output(&pb));
break;
}
case AV_CODEC_ID_FLAC: {
// Scratch output covering the 13 leading bytes that hold the channel field.
uint8_t buf[13];
int size = FFMIN(codecpar->extradata_size, sizeof(buf));
init_put_bits(&pb, buf, sizeof(buf));
ret = init_get_bits8(&gb, codecpar->extradata, size);
if (ret < 0)
return ret;
put_bits32(&pb, get_bits_long(&gb, 32)); // min/max blocksize
put_bits63(&pb, 48, get_bits64(&gb, 48)); // min/max framesize
put_bits(&pb, 20, get_bits(&gb, 20)); // samplerate
// Replace the 3-bit channel field with the new channel count minus one.
skip_bits(&gb, 3);
put_bits(&pb, 3, codecpar->ch_layout.nb_channels - 1);
// A negative count means the reads above ran past the input.
ret = get_bits_left(&gb);
if (ret < 0)
return AVERROR_INVALIDDATA;
// Copy the remaining input bits, limited to the space left in buf.
ret = FFMIN(ret, put_bits_left(&pb));
put_bits(&pb, ret, get_bits(&gb, ret));
flush_put_bits(&pb);
memcpy(codecpar->extradata, buf, put_bytes_output(&pb));
break;
}
}
return 0;
}
/**
 * Parse the scalable channel layout configuration of an Audio Element OBU.
 *
 * For every layer this reads the loudspeaker layout, the optional output
 * gain and the substream counts, marks the layer's substreams as stereo
 * (coupled) or mono, and builds the layer's channel layout.
 *
 * Layers with a known loudspeaker layout get a custom order layout that
 * starts with the channels of the previous layer, followed by the channel
 * pairs of this layer's coupled substreams and then FC/LFE for its mono
 * substreams. The result is converted to a native order layout when that
 * is possible. Layers with an unknown layout get an unspecified order
 * layout with substream_count + coupled_substream_count channels.
 *
 * Malformed input is rejected instead of being processed:
 *  - more coupled substreams than substreams in a layer,
 *  - a layer whose channels do not fit into the channel count of its
 *    loudspeaker layout (e.g. a layer that is not a superset of the
 *    previous one), which would otherwise write past the channel map.
 *
 * @param s             logging context (unused here)
 * @param pb            reader positioned at the configuration
 * @param audio_element audio element to fill in; its substreams array and
 *                      element must already be allocated
 * @param codec_config  codec config of the audio element (unused here)
 * @return 0 on success, a negative AVERROR code on failure
 */
static int scalable_channel_layout_config(void *s, AVIOContext *pb,
                                          IAMFAudioElement *audio_element,
                                          const IAMFCodecConfig *codec_config)
{
    /* Channel pairs a coupled substream can carry, in assignment priority. */
    static const struct {
        uint64_t       mask;
        enum AVChannel left, right;
    } stereo_pairs[] = {
        { AV_CH_LAYOUT_STEREO,
          AV_CHAN_FRONT_LEFT,           AV_CHAN_FRONT_RIGHT           },
        { AV_CH_FRONT_LEFT_OF_CENTER|AV_CH_FRONT_RIGHT_OF_CENTER,
          AV_CHAN_FRONT_LEFT_OF_CENTER, AV_CHAN_FRONT_RIGHT_OF_CENTER },
        { AV_CH_SIDE_LEFT|AV_CH_SIDE_RIGHT,
          AV_CHAN_SIDE_LEFT,            AV_CHAN_SIDE_RIGHT            },
        { AV_CH_BACK_LEFT|AV_CH_BACK_RIGHT,
          AV_CHAN_BACK_LEFT,            AV_CHAN_BACK_RIGHT            },
        { AV_CH_TOP_FRONT_LEFT|AV_CH_TOP_FRONT_RIGHT,
          AV_CHAN_TOP_FRONT_LEFT,       AV_CHAN_TOP_FRONT_RIGHT       },
        { AV_CH_TOP_SIDE_LEFT|AV_CH_TOP_SIDE_RIGHT,
          AV_CHAN_TOP_SIDE_LEFT,        AV_CHAN_TOP_SIDE_RIGHT        },
        { AV_CH_TOP_BACK_LEFT|AV_CH_TOP_BACK_RIGHT,
          AV_CHAN_TOP_BACK_LEFT,        AV_CHAN_TOP_BACK_RIGHT        },
    };
    const int nb_stereo_pairs = sizeof(stereo_pairs) / sizeof(stereo_pairs[0]);
    int nb_layers, k = 0;

    nb_layers = avio_r8(pb) >> 5; // get_bits(&gb, 3);
    // skip_bits(&gb, 5); //reserved

    if (nb_layers > 6 || nb_layers == 0)
        return AVERROR_INVALIDDATA;

    audio_element->layers = av_calloc(nb_layers, sizeof(*audio_element->layers));
    if (!audio_element->layers)
        return AVERROR(ENOMEM);

    audio_element->nb_layers = nb_layers;

    // k: substreams consumed so far, n: channels placed in the custom maps so far
    for (int i = 0, n = 0; i < nb_layers; i++) {
        AVChannelLayout ch_layout = { 0 };
        AVIAMFLayer *layer;
        int loudspeaker_layout, output_gain_is_present_flag;
        int substream_count, coupled_substream_count;
        int expanded_loudspeaker_layout = -1;
        int ret, byte = avio_r8(pb);

        layer = av_iamf_audio_element_add_layer(audio_element->element);
        if (!layer)
            return AVERROR(ENOMEM);

        loudspeaker_layout = byte >> 4; // get_bits(&gb, 4);
        output_gain_is_present_flag = (byte >> 3) & 1; //get_bits1(&gb);
        if ((byte >> 2) & 1)
            layer->flags |= AV_IAMF_LAYER_FLAG_RECON_GAIN;
        substream_count = avio_r8(pb);
        coupled_substream_count = avio_r8(pb);

        if (substream_count + k > audio_element->nb_substreams)
            return AVERROR_INVALIDDATA;
        // Coupled substreams are a subset of the layer's substreams.
        if (coupled_substream_count > substream_count)
            return AVERROR_INVALIDDATA;

        audio_element->layers[i].substream_count         = substream_count;
        audio_element->layers[i].coupled_substream_count = coupled_substream_count;
        if (output_gain_is_present_flag) {
            layer->output_gain_flags = avio_r8(pb) >> 2; // get_bits(&gb, 6);
            layer->output_gain = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8);
        }

        // Only the first layer may carry an expanded loudspeaker layout.
        if (!i && loudspeaker_layout == 15)
            expanded_loudspeaker_layout = avio_r8(pb);
        if (expanded_loudspeaker_layout > 0 && expanded_loudspeaker_layout < 13) {
            av_channel_layout_copy(&ch_layout, &ff_iamf_expanded_scalable_ch_layouts[expanded_loudspeaker_layout]);
        } else if (loudspeaker_layout < 10) {
            av_channel_layout_copy(&ch_layout, &ff_iamf_scalable_ch_layouts[loudspeaker_layout]);
            // Keep only the channels this layer adds on top of the previous one.
            if (i)
                ch_layout.u.mask &= ~av_channel_layout_subset(&audio_element->element->layers[i-1]->ch_layout, UINT64_MAX);
        } else
            ch_layout = (AVChannelLayout){ .order = AV_CHANNEL_ORDER_UNSPEC,
                                           .nb_channels = substream_count +
                                                          coupled_substream_count };

        // The coupled (stereo) substreams of a layer come first.
        for (int j = 0; j < substream_count; j++) {
            IAMFSubStream *substream = &audio_element->substreams[k++];

            substream->codecpar->ch_layout = j < coupled_substream_count ? (AVChannelLayout)AV_CHANNEL_LAYOUT_STEREO :
                                                                           (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO;

            ret = update_extradata(substream->codecpar);
            if (ret < 0)
                return ret;
        }

        if (ch_layout.order == AV_CHANNEL_ORDER_NATIVE) {
            const int nb_channels = ch_layout.nb_channels;
            const int nb_mono     = substream_count - coupled_substream_count;

            // The channels inherited from the previous layer must fit.
            if (n > nb_channels)
                return AVERROR_INVALIDDATA;

            ret = av_channel_layout_custom_init(&layer->ch_layout, nb_channels);
            if (ret < 0)
                return ret;

            for (int j = 0; j < n; j++)
                layer->ch_layout.u.map[j].id = av_channel_layout_channel_from_index(&audio_element->element->layers[i-1]->ch_layout, j);

            // One channel pair per coupled substream, first match wins.
            for (int j = 0; j < coupled_substream_count; j++) {
                for (int p = 0; p < nb_stereo_pairs; p++) {
                    if (!(ch_layout.u.mask & stereo_pairs[p].mask))
                        continue;
                    if (n + 2 > nb_channels)
                        return AVERROR_INVALIDDATA;
                    layer->ch_layout.u.map[n++].id = stereo_pairs[p].left;
                    layer->ch_layout.u.map[n++].id = stereo_pairs[p].right;
                    ch_layout.u.mask &= ~stereo_pairs[p].mask;
                    break;
                }
            }

            // Mono substreams supply FC and LFE when the layout has them.
            for (int j = 0; j < nb_mono; j++) {
                if (ch_layout.u.mask & AV_CH_FRONT_CENTER) {
                    if (n + 1 > nb_channels)
                        return AVERROR_INVALIDDATA;
                    layer->ch_layout.u.map[n++].id = AV_CHAN_FRONT_CENTER;
                    ch_layout.u.mask &= ~AV_CH_FRONT_CENTER;
                }
                if (ch_layout.u.mask & AV_CH_LOW_FREQUENCY) {
                    if (n + 1 > nb_channels)
                        return AVERROR_INVALIDDATA;
                    layer->ch_layout.u.map[n++].id = AV_CHAN_LOW_FREQUENCY;
                    ch_layout.u.mask &= ~AV_CH_LOW_FREQUENCY;
                }
            }

            // Not being representable in native order is not an error.
            ret = av_channel_layout_retype(&layer->ch_layout, AV_CHANNEL_ORDER_NATIVE, 0);
            if (ret < 0 && ret != AVERROR(ENOSYS))
                return ret;
        } else // AV_CHANNEL_ORDER_UNSPEC
            av_channel_layout_copy(&layer->ch_layout, &ch_layout);
    }

    return 0;
}
/**
 * Parse the ambisonics configuration of an Audio Element OBU.
 *
 * A single layer is created. In mode 0 every substream is mono and the
 * layer gets a custom layout built from the channel mapping in the
 * bitstream (converted to ambisonic order when possible). In mode 1 the
 * layer gets an ambisonic order layout together with the demixing matrix
 * read from the bitstream, and the leading substreams are marked stereo.
 *
 * @param s             logging context (unused here)
 * @param pb            reader positioned at the configuration
 * @param audio_element audio element to fill in; its substreams array and
 *                      element must already be allocated
 * @param codec_config  codec config of the audio element (unused here)
 * @return 0 on success, a negative AVERROR code on failure
 */
static int ambisonics_config(void *s, AVIOContext *pb,
                             IAMFAudioElement *audio_element,
                             const IAMFCodecConfig *codec_config)
{
    AVIAMFLayer *layer;
    unsigned mode;
    int nb_channels, nb_streams, nb_coupled, matrix_len, side;
    int ret;

    mode = ffio_read_leb(pb);
    if (mode > 1)
        return AVERROR_INVALIDDATA;

    nb_channels = avio_r8(pb); // C
    nb_streams  = avio_r8(pb); // N
    if (nb_channels == 0 || audio_element->nb_substreams != nb_streams)
        return AVERROR_INVALIDDATA;

    /* incomplete order - some harmonics are missing */
    side = floor(sqrt(nb_channels - 1));
    side++;
    if (side * side != nb_channels)
        return AVERROR_INVALIDDATA;

    audio_element->layers = av_mallocz(sizeof(*audio_element->layers));
    if (!audio_element->layers)
        return AVERROR(ENOMEM);
    audio_element->nb_layers               = 1;
    audio_element->layers->substream_count = nb_streams;

    layer = av_iamf_audio_element_add_layer(audio_element->element);
    if (!layer)
        return AVERROR(ENOMEM);
    layer->ambisonics_mode = mode;

    if (mode == 0) {
        for (int i = 0; i < nb_streams; i++) {
            IAMFSubStream *sub = &audio_element->substreams[i];

            sub->codecpar->ch_layout = (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO;

            ret = update_extradata(sub->codecpar);
            if (ret < 0)
                return ret;
        }

        ret = av_channel_layout_custom_init(&layer->ch_layout, nb_channels);
        if (ret < 0)
            return ret;

        // One mapping byte per output channel.
        for (int i = 0; i < nb_channels; i++)
            layer->ch_layout.u.map[i].id = avio_r8(pb) + AV_CHAN_AMBISONIC_BASE;

        // Not being representable in ambisonic order is not an error.
        ret = av_channel_layout_retype(&layer->ch_layout, AV_CHANNEL_ORDER_AMBISONIC, 0);
        if (ret < 0 && ret != AVERROR(ENOSYS))
            return ret;

        return 0;
    }

    nb_coupled = avio_r8(pb); // M
    matrix_len = (nb_streams + nb_coupled) * nb_channels;

    audio_element->layers->coupled_substream_count = nb_coupled;

    layer->ch_layout = (AVChannelLayout){ .order = AV_CHANNEL_ORDER_AMBISONIC, .nb_channels = nb_channels };
    layer->demixing_matrix = av_malloc_array(matrix_len, sizeof(*layer->demixing_matrix));
    if (!layer->demixing_matrix)
        return AVERROR(ENOMEM);

    // Coefficients are signed 16-bit values scaled by 1/256.
    for (int i = 0; i < matrix_len; i++)
        layer->demixing_matrix[i] = av_make_q(sign_extend(avio_rb16(pb), 16), 1 << 8);

    // The first M substreams are coupled, the rest are mono.
    for (int i = 0; i < nb_streams; i++) {
        IAMFSubStream *sub = &audio_element->substreams[i];

        if (i < nb_coupled)
            sub->codecpar->ch_layout = (AVChannelLayout)AV_CHANNEL_LAYOUT_STEREO;
        else
            sub->codecpar->ch_layout = (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO;

        ret = update_extradata(sub->codecpar);
        if (ret < 0)
            return ret;
    }

    return 0;
}
/**
 * Parse a parameter definition and register it in the IAMF context.
 *
 * If a definition with the same parameter_id is already known, the newly
 * parsed one must be byte-identical to it. Otherwise a new entry is
 * appended to c->param_definitions. In both cases the freshly allocated
 * definition is returned through out_param_definition.
 *
 * @param s                    logging context
 * @param c                    IAMF context holding the known definitions
 * @param pb                   reader positioned at the parameter definition
 * @param type                 one of the AV_IAMF_PARAMETER_DEFINITION_*
 *                             values handled below (mix gain, demixing,
 *                             recon gain)
 * @param audio_element        audio element the definition belongs to;
 *                             must not be NULL for demixing definitions
 * @param out_param_definition receives the parsed definition; must not be
 *                             NULL
 * @return 0 on success, a negative AVERROR code on failure
 */
static int param_parse(void *s, IAMFContext *c, AVIOContext *pb,
                       unsigned int type,
                       const IAMFAudioElement *audio_element,
                       AVIAMFParamDefinition **out_param_definition)
{
    IAMFParamDefinition *param_definition = NULL;
    AVIAMFParamDefinition *param;
    unsigned int parameter_id, parameter_rate, mode;
    unsigned int duration = 0, constant_subblock_duration = 0, nb_subblocks = 0;
    unsigned int total_duration = 0;
    size_t param_size;

    parameter_id = ffio_read_leb(pb);

    // Look for an earlier definition using the same id.
    for (int i = 0; i < c->nb_param_definitions; i++)
        if (c->param_definitions[i]->param->parameter_id == parameter_id) {
            param_definition = c->param_definitions[i];
            break;
        }

    parameter_rate = ffio_read_leb(pb);
    mode = avio_r8(pb) >> 7;

    // Durations and subblocks are only coded here when the mode bit is 0.
    if (mode == 0) {
        duration = ffio_read_leb(pb);
        if (!duration)
            return AVERROR_INVALIDDATA;
        constant_subblock_duration = ffio_read_leb(pb);
        if (constant_subblock_duration == 0)
            nb_subblocks = ffio_read_leb(pb);
        else {
            nb_subblocks = duration / constant_subblock_duration;
            total_duration = duration;
        }
    }

    param = av_iamf_param_definition_alloc(type, nb_subblocks, &param_size);
    if (!param)
        return AVERROR(ENOMEM);

    for (unsigned int i = 0; i < nb_subblocks; i++) {
        void *subblock = av_iamf_param_definition_get_subblock(param, i);
        unsigned int subblock_duration = constant_subblock_duration;

        if (constant_subblock_duration == 0) {
            // Explicit per-subblock durations; their sum is checked below.
            subblock_duration = ffio_read_leb(pb);
            total_duration += subblock_duration;
        } else if (i == nb_subblocks - 1)
            // The last subblock covers whatever is left of the duration.
            subblock_duration = duration - i * constant_subblock_duration;

        switch (type) {
        case AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN: {
            AVIAMFMixGain *mix = subblock;
            mix->subblock_duration = subblock_duration;
            break;
        }
        case AV_IAMF_PARAMETER_DEFINITION_DEMIXING: {
            AVIAMFDemixingInfo *demix = subblock;
            demix->subblock_duration = subblock_duration;
            // DefaultDemixingInfoParameterData
            av_assert0(audio_element);
            demix->dmixp_mode = avio_r8(pb) >> 5;
            audio_element->element->default_w = avio_r8(pb) >> 4;
            break;
        }
        case AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN: {
            AVIAMFReconGain *recon = subblock;
            recon->subblock_duration = subblock_duration;
            break;
        }
        default:
            av_free(param);
            return AVERROR_INVALIDDATA;
        }
    }

    if (!mode && !constant_subblock_duration && total_duration != duration) {
        av_log(s, AV_LOG_ERROR, "Invalid subblock durations in parameter_id %u\n", parameter_id);
        av_free(param);
        return AVERROR_INVALIDDATA;
    }

    param->parameter_id = parameter_id;
    param->parameter_rate = parameter_rate;
    param->duration = duration;
    param->constant_subblock_duration = constant_subblock_duration;
    param->nb_subblocks = nb_subblocks;

    if (param_definition) {
        // A repeated definition has to match the stored one exactly.
        if (param_definition->param_size != param_size || memcmp(param_definition->param, param, param_size)) {
            av_log(s, AV_LOG_ERROR, "Inconsistent parameters for parameter_id %u\n", parameter_id);
            av_free(param);
            return AVERROR_INVALIDDATA;
        }
    } else {
        IAMFParamDefinition **tmp = av_realloc_array(c->param_definitions, c->nb_param_definitions + 1,
                                                     sizeof(*c->param_definitions));
        if (!tmp) {
            av_free(param);
            return AVERROR(ENOMEM);
        }
        c->param_definitions = tmp;

        param_definition = av_mallocz(sizeof(*param_definition));
        if (!param_definition) {
            av_free(param);
            return AVERROR(ENOMEM);
        }
        param_definition->param = param;
        param_definition->mode = !mode;
        param_definition->param_size = param_size;
        param_definition->audio_element = audio_element;

        c->param_definitions[c->nb_param_definitions++] = param_definition;
    }

    av_assert0(out_param_definition);
    *out_param_definition = param;

    return 0;
}
/**
 * Parse one Audio Element OBU and, on success, append it to c->audio_elements.
 *
 * The payload is first copied from pb into a temporary buffer; all further
 * parsing reads from a memory-backed AVIOContext over that buffer.
 *
 * @param s   logging context handed to av_log()
 * @param c   IAMF context: codec configs are looked up in it and the parsed
 *            audio element is stored in it
 * @param pb  input positioned at the start of the OBU payload
 * @param len payload size in bytes
 * @return 0 on success, or when the element is skipped because its
 *         audio_element_type or codec id is unknown; a negative AVERROR code
 *         on failure
 */
static int audio_element_obu(void *s, IAMFContext *c, AVIOContext *pb, int len)
{
    const IAMFCodecConfig *codec_config;
    AVIAMFAudioElement *element;
    IAMFAudioElement **tmp, *audio_element = NULL;
    FFIOContext b;
    AVIOContext *pbc;
    uint8_t *buf;
    unsigned audio_element_id, nb_substreams, codec_config_id, num_parameters;
    int audio_element_type, ret;

    buf = av_malloc(len);
    if (!buf)
        return AVERROR(ENOMEM);

    ret = ffio_read_size(pb, buf, len);
    if (ret < 0)
        goto fail;

    ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL);
    pbc = &b.pub;

    audio_element_id = ffio_read_leb(pbc);

    /* Audio element ids must be unique within the context. */
    for (int i = 0; i < c->nb_audio_elements; i++)
        if (c->audio_elements[i]->audio_element_id == audio_element_id) {
            av_log(s, AV_LOG_ERROR, "Duplicate audio_element_id %u\n", audio_element_id);
            ret = AVERROR_INVALIDDATA;
            goto fail;
        }

    audio_element_type = avio_r8(pbc) >> 5;
    if (audio_element_type > AV_IAMF_AUDIO_ELEMENT_TYPE_SCENE) {
        av_log(s, AV_LOG_DEBUG, "Unknown audio_element_type referenced in an audio element. Ignoring\n");
        /* Not an error: the "fail" label doubles as the common cleanup path,
         * and nothing but buf has been allocated yet. */
        ret = 0;
        goto fail;
    }

    codec_config_id = ffio_read_leb(pbc);

    codec_config = ff_iamf_get_codec_config(c, codec_config_id);
    if (!codec_config) {
        av_log(s, AV_LOG_ERROR, "Non existent codec config id %u referenced in an audio element\n", codec_config_id);
        ret = AVERROR_INVALIDDATA;
        goto fail;
    }

    if (codec_config->codec_id == AV_CODEC_ID_NONE) {
        av_log(s, AV_LOG_DEBUG, "Unknown codec id referenced in an audio element. Ignoring\n");
        ret = 0;
        goto fail;
    }

    /* Grow the array now; the new slot is only filled once parsing succeeded. */
    tmp = av_realloc_array(c->audio_elements, c->nb_audio_elements + 1, sizeof(*c->audio_elements));
    if (!tmp) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }
    c->audio_elements = tmp;

    audio_element = av_mallocz(sizeof(*audio_element));
    if (!audio_element) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }

    nb_substreams = ffio_read_leb(pbc);
    audio_element->codec_config_id = codec_config_id;
    audio_element->audio_element_id = audio_element_id;
    audio_element->substreams = av_calloc(nb_substreams, sizeof(*audio_element->substreams));
    if (!audio_element->substreams) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }
    audio_element->nb_substreams = nb_substreams;

    element = audio_element->element = av_iamf_audio_element_alloc();
    if (!element) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }
    audio_element->celement = element;

    element->audio_element_type = audio_element_type;

    /* Every substream gets codec parameters derived from the codec config. */
    for (int i = 0; i < audio_element->nb_substreams; i++) {
        IAMFSubStream *substream = &audio_element->substreams[i];

        substream->codecpar = avcodec_parameters_alloc();
        if (!substream->codecpar) {
            ret = AVERROR(ENOMEM);
            goto fail;
        }

        substream->audio_substream_id = ffio_read_leb(pbc);

        substream->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
        substream->codecpar->codec_id = codec_config->codec_id;
        substream->codecpar->frame_size = codec_config->nb_samples;
        substream->codecpar->sample_rate = codec_config->sample_rate;
        substream->codecpar->seek_preroll = -codec_config->audio_roll_distance * codec_config->nb_samples;

        switch(substream->codecpar->codec_id) {
        case AV_CODEC_ID_AAC:
        case AV_CODEC_ID_FLAC:
        case AV_CODEC_ID_OPUS:
            /* Private copy of the codec config extradata, followed by
             * zeroed padding. */
            substream->codecpar->extradata = av_malloc(codec_config->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE);
            if (!substream->codecpar->extradata) {
                ret = AVERROR(ENOMEM);
                goto fail;
            }
            memcpy(substream->codecpar->extradata, codec_config->extradata, codec_config->extradata_size);
            memset(substream->codecpar->extradata + codec_config->extradata_size, 0, AV_INPUT_BUFFER_PADDING_SIZE);
            substream->codecpar->extradata_size = codec_config->extradata_size;
            break;
        }
    }

    num_parameters = ffio_read_leb(pbc);
    if (num_parameters > 2 && audio_element_type == 0) {
        av_log(s, AV_LOG_ERROR, "Audio Element parameter count %u is invalid"
                                " for Channel representations\n", num_parameters);
        ret = AVERROR_INVALIDDATA;
        goto fail;
    }
    if (num_parameters && audio_element_type != 0) {
        av_log(s, AV_LOG_ERROR, "Audio Element parameter count %u is invalid"
                                " for Scene representations\n", num_parameters);
        ret = AVERROR_INVALIDDATA;
        goto fail;
    }

    for (int i = 0; i < num_parameters; i++) {
        unsigned type;

        type = ffio_read_leb(pbc);
        if (type == AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN)
            /* Mix gain definitions are rejected inside an audio element. */
            ret = AVERROR_INVALIDDATA;
        else if (type == AV_IAMF_PARAMETER_DEFINITION_DEMIXING) {
            /* At most one demixing definition per element. */
            if (element->demixing_info) {
                ret = AVERROR_INVALIDDATA;
                goto fail;
            }
            ret = param_parse(s, c, pbc, type, audio_element, &element->demixing_info);
        } else if (type == AV_IAMF_PARAMETER_DEFINITION_RECON_GAIN) {
            /* At most one recon gain definition per element. */
            if (element->recon_gain_info) {
                ret = AVERROR_INVALIDDATA;
                goto fail;
            }
            ret = param_parse(s, c, pbc, type, audio_element, &element->recon_gain_info);
        } else {
            /* Unknown definition type: skip it using its declared size.
             * ret keeps its previous, non-negative value. */
            unsigned param_definition_size = ffio_read_leb(pbc);
            avio_skip(pbc, param_definition_size);
        }
        if (ret < 0)
            goto fail;
    }

    if (audio_element_type == AV_IAMF_AUDIO_ELEMENT_TYPE_CHANNEL) {
        ret = scalable_channel_layout_config(s, pbc, audio_element, codec_config);
        if (ret < 0)
            goto fail;
    } else if (audio_element_type == AV_IAMF_AUDIO_ELEMENT_TYPE_SCENE) {
        ret = ambisonics_config(s, pbc, audio_element, codec_config);
        if (ret < 0)
            goto fail;
    } else {
        /* Larger types were filtered out right after reading the type. */
        av_assert0(0);
    }

    c->audio_elements[c->nb_audio_elements++] = audio_element;

    len -= avio_tell(pbc);
    if (len)
        av_log(s, AV_LOG_WARNING, "Underread in audio_element_obu. %d bytes left at the end\n", len);

    ret = 0;
fail:
    av_free(buf);
    if (ret < 0)
        ff_iamf_free_audio_element(&audio_element);
    return ret;
}
/**
 * Read a NUL-terminated label from pb and return a newly allocated copy.
 *
 * At most sizeof(buf) bytes are consumed from pb; the copy stored in *label
 * is limited to what fits in the local buffer.
 *
 * @param pb    input to read the string from
 * @param label receives the av_strdup()'ed string on success
 * @return 0 on success, pb->error if the context reports an error,
 *         AVERROR_INVALIDDATA if the end of input was hit,
 *         AVERROR(ENOMEM) if the copy could not be allocated
 */
static int label_string(AVIOContext *pb, char **label)
{
    /* avio_get_str() and av_strdup() take char strings, so the scratch
     * buffer is declared as char rather than uint8_t. */
    char buf[128];

    avio_get_str(pb, sizeof(buf), buf, sizeof(buf));

    if (pb->error)
        return pb->error;
    if (pb->eof_reached)
        return AVERROR_INVALIDDATA;

    *label = av_strdup(buf);
    if (!*label)
        return AVERROR(ENOMEM);

    return 0;
}
/**
 * Parse one Mix Presentation OBU and, on success, append it to
 * c->mix_presentations.
 *
 * The payload is first copied from pb into a temporary buffer; all further
 * parsing reads from a memory-backed AVIOContext over that buffer.
 *
 * @param s   logging context handed to av_log()
 * @param c   IAMF context: referenced audio elements are looked up in it and
 *            the parsed mix presentation is stored in it
 * @param pb  input positioned at the start of the OBU payload
 * @param len payload size in bytes
 * @return 0 on success, a negative AVERROR code on failure
 */
static int mix_presentation_obu(void *s, IAMFContext *c, AVIOContext *pb, int len)
{
    AVIAMFMixPresentation *mix;
    IAMFMixPresentation **tmp, *mix_presentation = NULL;
    FFIOContext b;
    AVIOContext *pbc;
    uint8_t *buf;
    unsigned nb_submixes, mix_presentation_id;
    int ret;

    buf = av_malloc(len);
    if (!buf)
        return AVERROR(ENOMEM);

    ret = ffio_read_size(pb, buf, len);
    if (ret < 0)
        goto fail;

    ffio_init_context(&b, buf, len, 0, NULL, NULL, NULL, NULL);
    pbc = &b.pub;

    mix_presentation_id = ffio_read_leb(pbc);

    /* Mix presentation ids must be unique within the context. */
    for (int i = 0; i < c->nb_mix_presentations; i++)
        if (c->mix_presentations[i]->mix_presentation_id == mix_presentation_id) {
            av_log(s, AV_LOG_ERROR, "Duplicate mix_presentation_id %u\n", mix_presentation_id);
            ret = AVERROR_INVALIDDATA;
            goto fail;
        }

    /* Grow the array now; the new slot is only filled once parsing succeeded. */
    tmp = av_realloc_array(c->mix_presentations, c->nb_mix_presentations + 1, sizeof(*c->mix_presentations));
    if (!tmp) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }
    c->mix_presentations = tmp;

    mix_presentation = av_mallocz(sizeof(*mix_presentation));
    if (!mix_presentation) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }

    mix_presentation->mix_presentation_id = mix_presentation_id;
    mix = mix_presentation->mix = av_iamf_mix_presentation_alloc();
    if (!mix) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }
    mix_presentation->cmix = mix;

    mix_presentation->count_label = ffio_read_leb(pbc);
    mix_presentation->language_label = av_calloc(mix_presentation->count_label,
                                                 sizeof(*mix_presentation->language_label));
    if (!mix_presentation->language_label) {
        /* Reset the count so it does not describe an array that was never
         * allocated. */
        mix_presentation->count_label = 0;
        ret = AVERROR(ENOMEM);
        goto fail;
    }

    /* First all language labels ... */
    for (int i = 0; i < mix_presentation->count_label; i++) {
        ret = label_string(pbc, &mix_presentation->language_label[i]);
        if (ret < 0)
            goto fail;
    }

    /* ... then one presentation annotation per language, keyed by its label. */
    for (int i = 0; i < mix_presentation->count_label; i++) {
        char *annotation = NULL;

        ret = label_string(pbc, &annotation);
        if (ret < 0)
            goto fail;
        /* AV_DICT_DONT_STRDUP_VAL hands the allocated string to the dictionary. */
        ret = av_dict_set(&mix->annotations, mix_presentation->language_label[i], annotation,
                          AV_DICT_DONT_STRDUP_VAL | AV_DICT_DONT_OVERWRITE);
        if (ret < 0)
            goto fail;
    }

    nb_submixes = ffio_read_leb(pbc);
    for (int i = 0; i < nb_submixes; i++) {
        AVIAMFSubmix *sub_mix;
        unsigned nb_elements, nb_layouts;

        sub_mix = av_iamf_mix_presentation_add_submix(mix);
        if (!sub_mix) {
            ret = AVERROR(ENOMEM);
            goto fail;
        }

        nb_elements = ffio_read_leb(pbc);
        for (int j = 0; j < nb_elements; j++) {
            AVIAMFSubmixElement *submix_element;
            IAMFAudioElement *audio_element = NULL;
            unsigned int rendering_config_extension_size;

            submix_element = av_iamf_submix_add_element(sub_mix);
            if (!submix_element) {
                ret = AVERROR(ENOMEM);
                goto fail;
            }

            submix_element->audio_element_id = ffio_read_leb(pbc);

            /* The referenced audio element must already have been parsed. */
            for (int k = 0; k < c->nb_audio_elements; k++)
                if (c->audio_elements[k]->audio_element_id == submix_element->audio_element_id) {
                    audio_element = c->audio_elements[k];
                    break;
                }

            if (!audio_element) {
                av_log(s, AV_LOG_ERROR, "Invalid Audio Element with id %u referenced by Mix Parameters %u\n",
                       submix_element->audio_element_id, mix_presentation_id);
                ret = AVERROR_INVALIDDATA;
                goto fail;
            }

            /* One element annotation per language, keyed by its label. */
            for (int k = 0; k < mix_presentation->count_label; k++) {
                char *annotation = NULL;

                ret = label_string(pbc, &annotation);
                if (ret < 0)
                    goto fail;
                ret = av_dict_set(&submix_element->annotations, mix_presentation->language_label[k], annotation,
                                  AV_DICT_DONT_STRDUP_VAL | AV_DICT_DONT_OVERWRITE);
                if (ret < 0)
                    goto fail;
            }

            submix_element->headphones_rendering_mode = avio_r8(pbc) >> 6;

            /* The rendering config extension is skipped, not interpreted. */
            rendering_config_extension_size = ffio_read_leb(pbc);
            avio_skip(pbc, rendering_config_extension_size);

            ret = param_parse(s, c, pbc, AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN,
                              NULL,
                              &submix_element->element_mix_config);
            if (ret < 0)
                goto fail;
            /* Signed 16-bit value over a denominator of 256. */
            submix_element->default_mix_gain = av_make_q(sign_extend(avio_rb16(pbc), 16), 1 << 8);
        }

        ret = param_parse(s, c, pbc, AV_IAMF_PARAMETER_DEFINITION_MIX_GAIN, NULL, &sub_mix->output_mix_config);
        if (ret < 0)
            goto fail;
        sub_mix->default_mix_gain = av_make_q(sign_extend(avio_rb16(pbc), 16), 1 << 8);

        nb_layouts = ffio_read_leb(pbc);
        for (int j = 0; j < nb_layouts; j++) {
            AVIAMFSubmixLayout *submix_layout;
            int info_type;
            int byte = avio_r8(pbc);

            submix_layout = av_iamf_submix_add_layout(sub_mix);
            if (!submix_layout) {
                ret = AVERROR(ENOMEM);
                goto fail;
            }

            /* The top two bits of the byte select the layout type. */
            submix_layout->layout_type = byte >> 6;
            if (submix_layout->layout_type < AV_IAMF_SUBMIX_LAYOUT_TYPE_LOUDSPEAKERS ||
                submix_layout->layout_type > AV_IAMF_SUBMIX_LAYOUT_TYPE_BINAURAL) {
                av_log(s, AV_LOG_ERROR, "Invalid Layout type %u in a submix from Mix Presentation %u\n",
                       submix_layout->layout_type, mix_presentation_id);
                ret = AVERROR_INVALIDDATA;
                goto fail;
            }
            /* NOTE(review): 2 is presumably AV_IAMF_SUBMIX_LAYOUT_TYPE_LOUDSPEAKERS - confirm. */
            if (submix_layout->layout_type == 2) {
                int sound_system;

                /* The next four bits index the sound system table. */
                sound_system = (byte >> 2) & 0xF;
                if (sound_system >= FF_ARRAY_ELEMS(ff_iamf_sound_system_map)) {
                    ret = AVERROR_INVALIDDATA;
                    goto fail;
                }
                /* Propagate a failed copy instead of continuing silently. */
                ret = av_channel_layout_copy(&submix_layout->sound_system,
                                             &ff_iamf_sound_system_map[sound_system].layout);
                if (ret < 0)
                    goto fail;
            } else
                submix_layout->sound_system = (AVChannelLayout)AV_CHANNEL_LAYOUT_BINAURAL;

            info_type = avio_r8(pbc);
            submix_layout->integrated_loudness = av_make_q(sign_extend(avio_rb16(pbc), 16), 1 << 8);
            submix_layout->digital_peak = av_make_q(sign_extend(avio_rb16(pbc), 16), 1 << 8);

            /* Bit 0: a true peak value follows. */
            if (info_type & 1)
                submix_layout->true_peak = av_make_q(sign_extend(avio_rb16(pbc), 16), 1 << 8);
            /* Bit 1: a list of anchored loudness values follows. */
            if (info_type & 2) {
                unsigned int num_anchored_loudness = avio_r8(pbc);

                for (int k = 0; k < num_anchored_loudness; k++) {
                    unsigned int anchor_element = avio_r8(pbc);
                    AVRational anchored_loudness = av_make_q(sign_extend(avio_rb16(pbc), 16), 1 << 8);

                    /* NOTE(review): because of the <= comparison, any value
                     * up to ALBUM other than DIALOGUE lands in the album
                     * slot - confirm this is intended. */
                    if (anchor_element == IAMF_ANCHOR_ELEMENT_DIALOGUE)
                        submix_layout->dialogue_anchored_loudness = anchored_loudness;
                    else if (anchor_element <= IAMF_ANCHOR_ELEMENT_ALBUM)
                        submix_layout->album_anchored_loudness = anchored_loudness;
                    else
                        av_log(s, AV_LOG_DEBUG, "Unknown anchor_element. Ignoring\n");
                }
            }

            /* Any other info_type bit: a sized blob follows, which is skipped. */
            if (info_type & 0xFC) {
                unsigned int info_type_size = ffio_read_leb(pbc);
                avio_skip(pbc, info_type_size);
            }
        }
    }

    c->mix_presentations[c->nb_mix_presentations++] = mix_presentation;

    len -= avio_tell(pbc);
    if (len)
        av_log(s, AV_LOG_WARNING, "Underread in mix_presentation_obu. %d bytes left at the end\n", len);

    ret = 0;
fail:
    av_free(buf);
    if (ret < 0)
        ff_iamf_free_mix_presentation(&mix_presentation);
    return ret;
}
/**
 * Parse an IAMF OBU header from a memory buffer.
 *
 * @param buf             buffer holding the start of the OBU
 * @param buf_size        number of valid bytes in buf; at most
 *                        MAX_IAMF_OBU_HEADER_SIZE bytes are examined
 * @param obu_size        receives the payload size, i.e. the declared
 *                        obu_size minus the trimming and extension fields
 *                        consumed here
 * @param start_pos       receives the offset of the payload from buf
 * @param type            receives the OBU type (first 5 bits of the header)
 * @param skip_samples    if non-NULL, receives num_samples_to_trim_at_start
 *                        (0 when the trimming flag is not set)
 * @param discard_padding if non-NULL, receives num_samples_to_trim_at_end
 *                        (0 when the trimming flag is not set)
 * @return the total OBU size in bytes (header plus payload) on success,
 *         a negative AVERROR code on failure
 */
int ff_iamf_parse_obu_header(const uint8_t *buf, int buf_size,
                             unsigned *obu_size, int *start_pos, enum IAMF_OBU_Type *type,
                             unsigned *skip_samples, unsigned *discard_padding)
{
    GetBitContext gb;
    int ret, extension_flag, trimming, start;
    unsigned skip = 0, discard = 0;
    unsigned size, consumed;

    ret = init_get_bits8(&gb, buf, FFMIN(buf_size, MAX_IAMF_OBU_HEADER_SIZE));
    if (ret < 0)
        return ret;

    *type = get_bits(&gb, 5);
    /*redundant =*/ get_bits1(&gb);
    trimming = get_bits1(&gb);
    extension_flag = get_bits1(&gb);

    *obu_size = get_leb(&gb);
    if (*obu_size > INT_MAX)
        return AVERROR_INVALIDDATA;

    /* Byte offset of the first byte covered by obu_size. */
    start = get_bits_count(&gb) / 8;

    if (trimming) {
        discard = get_leb(&gb); // num_samples_to_trim_at_end
        skip = get_leb(&gb); // num_samples_to_trim_at_start
    }

    if (skip_samples)
        *skip_samples = skip;
    if (discard_padding)
        *discard_padding = discard;

    if (extension_flag) {
        unsigned int extension_bytes;

        extension_bytes = get_leb(&gb);
        /* Keep the bit count below from overflowing an int. */
        if (extension_bytes > INT_MAX / 8)
            return AVERROR_INVALIDDATA;
        skip_bits_long(&gb, extension_bytes * 8);
    }

    /* The whole header has to lie inside the supplied buffer. */
    if (get_bits_left(&gb) < 0)
        return AVERROR_INVALIDDATA;

    size = *obu_size + start;
    if (size > INT_MAX)
        return AVERROR_INVALIDDATA;

    /* The trimming and extension fields count towards obu_size. A header
     * that consumed more than obu_size declares is broken; without this
     * check the unsigned subtraction below would wrap around. */
    consumed = get_bits_count(&gb) / 8 - start;
    if (consumed > *obu_size)
        return AVERROR_INVALIDDATA;

    *obu_size -= consumed;
    *start_pos = size - *obu_size;

    return size;
}
/**
 * Read consecutive descriptor OBUs (codec config, audio element, mix
 * presentation) from pb into c.
 *
 * Reading stops, with pb rewound to the start of the OBU header, at the first
 * OBU whose type lies in [IAMF_OBU_IA_PARAMETER_BLOCK,
 * IAMF_OBU_IA_SEQUENCE_HEADER), or once max_size bytes have been consumed.
 * OBUs of any other type are skipped.
 *
 * @param c        IAMF context filled with the parsed descriptors
 * @param pb       input positioned at the start of an OBU header
 * @param max_size maximum number of bytes that may be consumed
 * @param log_ctx  logging context handed to av_log()
 * @return 0 on success, a negative AVERROR code on failure
 */
int ff_iamfdec_read_descriptors(IAMFContext *c, AVIOContext *pb,
                                int max_size, void *log_ctx)
{
    uint8_t header[MAX_IAMF_OBU_HEADER_SIZE + AV_INPUT_BUFFER_PADDING_SIZE];
    int ret;

    while (1) {
        unsigned obu_size;
        enum IAMF_OBU_Type type;
        int start_pos, len, size;

        /* The header is read speculatively and the position is moved back
         * afterwards, so request seekback for that many bytes first. */
        if ((ret = ffio_ensure_seekback(pb, FFMIN(MAX_IAMF_OBU_HEADER_SIZE, max_size))) < 0)
            return ret;
        size = avio_read(pb, header, FFMIN(MAX_IAMF_OBU_HEADER_SIZE, max_size));
        if (size < 0)
            return size;
        /* Zero the padding after the bytes actually read. */
        memset(header + size, 0, AV_INPUT_BUFFER_PADDING_SIZE);

        len = ff_iamf_parse_obu_header(header, size, &obu_size, &start_pos, &type, NULL, NULL);
        if (len < 0 || obu_size > max_size) {
            av_log(log_ctx, AV_LOG_ERROR, "Failed to read obu header\n");
            avio_seek(pb, -size, SEEK_CUR);
            /* len is positive when only the size check failed; make sure the
             * caller still receives a negative error code. */
            return len < 0 ? len : AVERROR_INVALIDDATA;
        }

        /* Not a descriptor: rewind to the OBU start and stop. */
        if (type >= IAMF_OBU_IA_PARAMETER_BLOCK && type < IAMF_OBU_IA_SEQUENCE_HEADER) {
            avio_seek(pb, -size, SEEK_CUR);
            break;
        }

        /* Reposition pb at the first payload byte of this OBU. */
        avio_seek(pb, -(size - start_pos), SEEK_CUR);
        switch (type) {
        case IAMF_OBU_IA_CODEC_CONFIG:
            ret = codec_config_obu(log_ctx, c, pb, obu_size);
            break;
        case IAMF_OBU_IA_AUDIO_ELEMENT:
            ret = audio_element_obu(log_ctx, c, pb, obu_size);
            break;
        case IAMF_OBU_IA_MIX_PRESENTATION:
            ret = mix_presentation_obu(log_ctx, c, pb, obu_size);
            break;
        default: {
            /* Any other OBU type is skipped over. */
            int64_t offset = avio_skip(pb, obu_size);
            if (offset < 0)
                ret = offset;
            break;
        }
        }
        if (ret < 0) {
            av_log(log_ctx, AV_LOG_ERROR, "Failed to read obu type %d\n", type);
            return ret;
        }

        /* Account for header and payload of the OBU just consumed. */
        max_size -= obu_size + start_pos;
        if (max_size < 0)
            return AVERROR_INVALIDDATA;
        if (!max_size)
            break;
    }

    return 0;
}