mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
avformat/assdec: UTF-16 support
Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents to UTF-8 on the fly using FFTextReader, which acts as a converting wrapper around AVIOContext. It can also work on a static buffer, which is needed for format probing. The FFTextReader wrapper now also takes care of skipping the UTF-8 BOM. Fixes ticket #3496.
This commit is contained in:
parent
dcb29d37d4
commit
3e8426170c
@ -33,10 +33,13 @@ typedef struct ASSContext {
|
||||
|
||||
static int ass_probe(AVProbeData *p)
|
||||
{
|
||||
const char *header = "[Script Info]";
|
||||
char buf[13];
|
||||
FFTextReader tr;
|
||||
ff_text_init_buf(&tr, p->buf, p->buf_size);
|
||||
|
||||
if (!memcmp(p->buf, header, strlen(header)) ||
|
||||
!memcmp(p->buf + 3, header, strlen(header)))
|
||||
ff_text_read(&tr, buf, sizeof(buf));
|
||||
|
||||
if (!memcmp(buf, "[Script Info]", 13))
|
||||
return AVPROBE_SCORE_MAX;
|
||||
|
||||
return 0;
|
||||
@ -66,13 +69,13 @@ static int read_ts(const uint8_t *p, int64_t *start, int *duration)
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int64_t get_line(AVBPrint *buf, AVIOContext *pb)
|
||||
static int64_t get_line(AVBPrint *buf, FFTextReader *tr)
|
||||
{
|
||||
int64_t pos = avio_tell(pb);
|
||||
int64_t pos = ff_text_pos(tr);
|
||||
|
||||
av_bprint_clear(buf);
|
||||
for (;;) {
|
||||
char c = avio_r8(pb);
|
||||
char c = ff_text_r8(tr);
|
||||
if (!c)
|
||||
break;
|
||||
av_bprint_chars(buf, c, 1);
|
||||
@ -88,6 +91,8 @@ static int ass_read_header(AVFormatContext *s)
|
||||
AVBPrint header, line;
|
||||
int header_remaining, res = 0;
|
||||
AVStream *st;
|
||||
FFTextReader tr;
|
||||
ff_text_init_avio(&tr, s->pb);
|
||||
|
||||
st = avformat_new_stream(s, NULL);
|
||||
if (!st)
|
||||
@ -102,7 +107,7 @@ static int ass_read_header(AVFormatContext *s)
|
||||
av_bprint_init(&line, 0, AV_BPRINT_SIZE_UNLIMITED);
|
||||
|
||||
for (;;) {
|
||||
int64_t pos = get_line(&line, s->pb);
|
||||
int64_t pos = get_line(&line, &tr);
|
||||
|
||||
if (!line.str[0]) // EOF
|
||||
break;
|
||||
|
@ -20,9 +20,72 @@
|
||||
|
||||
#include "avformat.h"
|
||||
#include "subtitles.h"
|
||||
#include "avio_internal.h"
|
||||
#include "libavutil/avassert.h"
|
||||
#include "libavutil/avstring.h"
|
||||
|
||||
/**
 * Attach an FFTextReader to an AVIOContext and probe the start of the
 * stream for a byte order mark (BOM).
 *
 * Two bytes are read first and checked against the UTF-16LE (FF FE) and
 * UTF-16BE (FE FF) marks. If neither matches, a third byte is read and the
 * three bytes are checked against the UTF-8 mark (EF BB BF). A recognized
 * BOM is skipped by advancing buf_pos past it; bytes that were read but do
 * not belong to a BOM stay in r->buf, so ff_text_r8() hands them out before
 * touching the stream again.
 *
 * @param r  reader to initialize; type, pb, buf, buf_pos and buf_len are set
 * @param pb stream to read from
 */
void ff_text_init_avio(FFTextReader *r, AVIOContext *pb)
{
    /* BOMs are raw bytes, so keep them as unsigned char arrays and compare
     * with memcmp(); r->buf is unsigned char and must not be passed to the
     * char-based string functions. */
    static const unsigned char bom_utf16le[] = { 0xFF, 0xFE };
    static const unsigned char bom_utf16be[] = { 0xFE, 0xFF };
    static const unsigned char bom_utf8[]    = { 0xEF, 0xBB, 0xBF };
    int i;

    r->pb      = pb;
    r->buf_pos = r->buf_len = 0;
    r->type    = FF_UTF_8; /* default when no UTF-16 BOM is found */

    for (i = 0; i < 2; i++)
        r->buf[r->buf_len++] = avio_r8(r->pb);

    if (memcmp(r->buf, bom_utf16le, sizeof(bom_utf16le)) == 0) {
        r->type = FF_UTF16LE;
        r->buf_pos += 2;
    } else if (memcmp(r->buf, bom_utf16be, sizeof(bom_utf16be)) == 0) {
        r->type = FF_UTF16BE;
        r->buf_pos += 2;
    } else {
        /* The UTF-8 BOM is three bytes long: fetch one more before testing. */
        r->buf[r->buf_len++] = avio_r8(r->pb);
        if (memcmp(r->buf, bom_utf8, sizeof(bom_utf8)) == 0) {
            // UTF8
            r->buf_pos += 3;
        }
    }
}
|
||||
|
||||
/**
 * Set up an FFTextReader that reads from a memory buffer.
 *
 * The AVIOContext embedded in the reader (r->buf_pb) is zeroed, initialized
 * over the given buffer with no read/write/seek callbacks, and then used as
 * the source stream for ff_text_init_avio(), which performs the BOM probing.
 *
 * @param r    reader to initialize
 * @param buf  buffer to read from
 * @param size size of buf
 */
void ff_text_init_buf(FFTextReader *r, void *buf, size_t size)
{
    AVIOContext *mem_pb = &r->buf_pb;

    memset(mem_pb, 0, sizeof(*mem_pb));
    ffio_init_context(mem_pb, buf, size, 0, NULL, NULL, NULL, NULL);

    ff_text_init_avio(r, mem_pb);
}
|
||||
|
||||
/**
 * Report the position of the next byte ff_text_r8() will return.
 *
 * The underlying stream position is corrected by the number of bytes that
 * are still waiting in the reader's small internal buffer.
 *
 * @param r reader to query
 * @return avio_tell() of the source stream minus the count of buffered,
 *         not yet consumed bytes
 */
int64_t ff_text_pos(FFTextReader *r)
{
    int64_t stream_pos = avio_tell(r->pb);
    int pending        = r->buf_len - r->buf_pos; /* buffered but unread */

    return stream_pos - pending;
}
|
||||
|
||||
/**
 * Fetch the next byte of the (possibly converted) text stream.
 *
 * Bytes still held in r->buf are returned first. Otherwise, for UTF-16
 * input one code point is decoded from the stream, re-encoded as UTF-8 into
 * r->buf, and its first byte is returned; the remaining bytes are served by
 * subsequent calls. For any other reader type a single byte is read from
 * the stream and returned unchanged.
 *
 * @param r reader to pull from
 * @return the next byte; 0 if the UTF-16 decoder reports an error or the
 *         decoded code point is 0
 */
int ff_text_r8(FFTextReader *r)
{
    uint32_t codepoint;
    uint8_t out_byte;

    /* Drain whatever is left from BOM probing or a previous conversion. */
    if (r->buf_pos < r->buf_len)
        return r->buf[r->buf_pos++];

    switch (r->type) {
    case FF_UTF16LE:
        GET_UTF16(codepoint, avio_rl16(r->pb), return 0;)
        break;
    case FF_UTF16BE:
        GET_UTF16(codepoint, avio_rb16(r->pb), return 0;)
        break;
    default:
        /* 8-bit input is passed through without conversion. */
        return avio_r8(r->pb);
    }

    if (codepoint == 0)
        return 0;

    /* Refill the buffer with the UTF-8 form of the decoded code point. */
    r->buf_len = 0;
    r->buf_pos = 0;
    PUT_UTF8(codepoint, out_byte, r->buf[r->buf_len++] = out_byte;)

    return r->buf[r->buf_pos++]; // buf_len is at least 1
}
|
||||
|
||||
/**
 * Fill a buffer with exactly size bytes taken from ff_text_r8().
 *
 * No terminator is appended; every one of the size output bytes is whatever
 * ff_text_r8() returned for that position.
 *
 * @param r    reader to pull from
 * @param buf  destination buffer, at least size bytes large
 * @param size number of bytes to store
 */
void ff_text_read(FFTextReader *r, char *buf, size_t size)
{
    char *const end = buf + size;

    while (buf < end) {
        *buf = ff_text_r8(r);
        buf++;
    }
}
|
||||
|
||||
AVPacket *ff_subtitles_queue_insert(FFDemuxSubtitlesQueue *q,
|
||||
const uint8_t *event, int len, int merge)
|
||||
{
|
||||
|
@ -30,6 +30,62 @@ enum sub_sort {
|
||||
SUB_SORT_POS_TS, ///< sort by position, then timestamps
|
||||
};
|
||||
|
||||
/**
 * Input encoding tracked by FFTextReader.
 */
enum ff_utf_type {
    FF_UTF_8   = 0, ///< UTF-8, or other 8 bit encodings
    FF_UTF16LE = 1, ///< UTF-16, little endian
    FF_UTF16BE = 2, ///< UTF-16, big endian
};
|
||||
|
||||
typedef struct {
|
||||
int type;
|
||||
AVIOContext *pb;
|
||||
unsigned char buf[8];
|
||||
int buf_pos, buf_len;
|
||||
AVIOContext buf_pb;
|
||||
} FFTextReader;
|
||||
|
||||
/**
|
||||
* Initialize the FFTextReader from the given AVIOContext. This function will
|
||||
* read some bytes from pb, and test for UTF-8 or UTF-16 BOMs. Further accesses
|
||||
* to FFTextReader will read more data from pb.
|
||||
*
|
||||
* The purpose of FFTextReader is to transparently convert read data to UTF-8
|
||||
* if the stream had a UTF-16 BOM.
|
||||
*
|
||||
* @param r object which will be initialized
|
||||
* @param pb stream to read from (referenced as long as FFTextReader is in use)
|
||||
*/
|
||||
void ff_text_init_avio(FFTextReader *r, AVIOContext *pb);
|
||||
|
||||
/**
|
||||
* Similar to ff_text_init_avio(), but sets it up to read from a bounded buffer.
|
||||
*
|
||||
* @param r object which will be initialized
|
||||
* @param buf buffer to read from (referenced as long as FFTextReader is in use)
|
||||
* @param size size of buf
|
||||
*/
|
||||
void ff_text_init_buf(FFTextReader *r, void *buf, size_t size);
|
||||
|
||||
/**
|
||||
* Return the byte position of the next byte returned by ff_text_r8(). For
|
||||
* UTF-16 source streams, this will return the original position, but it will
|
||||
* be incorrect if a codepoint was only partially read with ff_text_r8().
|
||||
*/
|
||||
int64_t ff_text_pos(FFTextReader *r);
|
||||
|
||||
/**
|
||||
* Return the next byte. The return value is always 0 - 255. Returns 0 on EOF.
|
||||
* If the source stream is UTF-16, this reads from the stream converted to
|
||||
* UTF-8. On invalid UTF-16, 0 is returned.
|
||||
*/
|
||||
int ff_text_r8(FFTextReader *r);
|
||||
|
||||
/**
|
||||
* Read the given number of bytes (in UTF-8). On error or EOF, \0 bytes are
|
||||
* written.
|
||||
*/
|
||||
void ff_text_read(FFTextReader *r, char *buf, size_t size);
|
||||
|
||||
typedef struct {
|
||||
AVPacket *subs; ///< array of subtitles packets
|
||||
int nb_subs; ///< number of subtitles packets
|
||||
|
Loading…
x
Reference in New Issue
Block a user