diff --git a/libavcodec/htmlsubtitles.c b/libavcodec/htmlsubtitles.c index e1fb7bc3cf..baed28450e 100644 --- a/libavcodec/htmlsubtitles.c +++ b/libavcodec/htmlsubtitles.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2010 Aurelien Jacobs + * Copyright (c) 2017 Clément Bœsch * * This file is part of FFmpeg. * @@ -18,6 +19,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/avassert.h" #include "libavutil/avstring.h" #include "libavutil/common.h" #include "libavutil/parseutils.h" @@ -31,19 +33,6 @@ static int html_color_parse(void *log_ctx, const char *str) return rgba[0] | rgba[1] << 8 | rgba[2] << 16; } -enum { - PARAM_UNKNOWN = -1, - PARAM_SIZE, - PARAM_COLOR, - PARAM_FACE, - PARAM_NUMBER -}; - -typedef struct SrtStack { - char tag[128]; - char param[PARAM_NUMBER][128]; -} SrtStack; - static void rstrip_spaces_buf(AVBPrint *buf) { if (av_bprint_is_complete(buf)) @@ -75,17 +64,50 @@ static void handle_open_brace(AVBPrint *dst, const char **inp, int *an, int *clo av_bprint_chars(dst, *in, 1); } +struct font_tag { + char face[128]; + int size; + uint32_t color; +}; + +/* + * The general politic of the convert is to mask unsupported tags or formatting + * errors (but still alert the user/subtitles writer with an error/warning) + * without dropping any actual text content for the final user. + */ int ff_htmlmarkup_to_ass(void *log_ctx, AVBPrint *dst, const char *in) { - char *param, buffer[128], tmp[128]; - int len, tag_close, sptr = 1, line_start = 1, an = 0, end = 0; - SrtStack stack[16]; + char *param, buffer[128]; + int len, tag_close, sptr = 0, line_start = 1, an = 0, end = 0; int closing_brace_missing = 0; + int i, likely_a_tag; - stack[0].tag[0] = 0; - strcpy(stack[0].param[PARAM_SIZE], "{\\fs}"); - strcpy(stack[0].param[PARAM_COLOR], "{\\c}"); - strcpy(stack[0].param[PARAM_FACE], "{\\fn}"); + /* + * state stack is only present for fonts since they are the only tags where + * the state is not binary. Here is a typical use case: + * + * + * red 10 + * RED AND BIG + * red 10 again + * + * + * On the other hand, using the state system for all the tags should be + * avoided because it breaks wrongly nested tags such as: + * + * foo bar bla + * + * We don't want to break here; instead, we will treat all these tags as + * binary state markers. Basically, "" will activate bold, and "" + * will deactivate it, whatever the current state. + * + * This will also prevents cases where we have a random closing tag + * remaining after the opening one was dropped. Yes, this happens and we + * still don't want to print a "" at the end of the dialog event. + */ + struct font_tag stack[16]; + + memset(&stack[0], 0, sizeof(stack[0])); for (; !end && *in; in++) { switch (*in) { @@ -108,82 +130,127 @@ int ff_htmlmarkup_to_ass(void *log_ctx, AVBPrint *dst, const char *in) handle_open_brace(dst, &in, &an, &closing_brace_missing); break; case '<': + /* + * "<<" are likely latin guillemets in ASCII or some kind of random + * style effect; see sub/badsyntax.srt in the FATE samples + * directory for real test cases. + */ + + likely_a_tag = 1; + for (i = 0; in[1] == '<'; i++) { + av_bprint_chars(dst, '<', 1); + likely_a_tag = 0; + in++; + } + tag_close = in[1] == '/'; + if (tag_close) + likely_a_tag = 1; + + av_assert0(in[0] == '<'); + len = 0; + if (sscanf(in+tag_close+1, "%127[^<>]>%n", buffer, &len) >= 1 && len > 0) { + const int skip = len + tag_close; const char *tagname = buffer; - while (*tagname == ' ') + while (*tagname == ' ') { + likely_a_tag = 0; tagname++; + } if ((param = strchr(tagname, ' '))) *param++ = 0; - if ((!tag_close && sptr < FF_ARRAY_ELEMS(stack) && *tagname) || - ( tag_close && sptr > 0 && !av_strcasecmp(stack[sptr-1].tag, tagname))) { - int i, j, unknown = 0; - in += len + tag_close; - if (!tag_close) - memset(stack+sptr, 0, sizeof(*stack)); - if (!av_strcasecmp(tagname, "font")) { - if (tag_close) { - for (i=PARAM_NUMBER-1; i>=0; i--) - if (stack[sptr-1].param[i][0]) - for (j=sptr-2; j>=0; j--) - if (stack[j].param[i][0]) { - av_bprintf(dst, "%s", stack[j].param[i]); - break; - } - } else { + + /* Check if this is likely a tag */ +#define LIKELY_A_TAG_CHAR(x) (((x) >= '0' && (x) <= '9') || \ + ((x) >= 'a' && (x) <= 'z') || \ + ((x) >= 'A' && (x) <= 'Z') || \ + (x) == '_') + for (i = 0; tagname[i]; i++) { + if (!LIKELY_A_TAG_CHAR(tagname[i])) { + likely_a_tag = 0; + break; + } + } + + // TODO: reindent + + if (!av_strcasecmp(tagname, "font")) { + if (tag_close && sptr > 0) { + struct font_tag *cur_tag = &stack[sptr--]; + struct font_tag *last_tag = &stack[sptr]; + + if (cur_tag->size) { + if (!last_tag->size) + av_bprintf(dst, "{\\fs}"); + else if (last_tag->size != cur_tag->size) + av_bprintf(dst, "{\\fs%d}", last_tag->size); + } + + if (cur_tag->color & 0xff000000) { + if (!(last_tag->color & 0xff000000)) + av_bprintf(dst, "{\\c}"); + else if (last_tag->color != cur_tag->color) + av_bprintf(dst, "{\\c&H%X&}", last_tag->color & 0xffffff); + } + + if (cur_tag->face[0]) { + if (!last_tag->face[0]) + av_bprintf(dst, "{\\fn}"); + else if (strcmp(last_tag->face, cur_tag->face)) + av_bprintf(dst, "{\\fn%s}", last_tag->face); + } + } else if (!tag_close && sptr < FF_ARRAY_ELEMS(stack) - 1) { + struct font_tag *new_tag = &stack[sptr + 1]; + + *new_tag = stack[sptr++]; + while (param) { if (!av_strncasecmp(param, "size=", 5)) { - unsigned font_size; param += 5 + (param[5] == '"'); - if (sscanf(param, "%u", &font_size) == 1) { - snprintf(stack[sptr].param[PARAM_SIZE], - sizeof(stack[0].param[PARAM_SIZE]), - "{\\fs%u}", font_size); - } + if (sscanf(param, "%u", &new_tag->size) == 1) + av_bprintf(dst, "{\\fs%u}", new_tag->size); } else if (!av_strncasecmp(param, "color=", 6)) { + int color; param += 6 + (param[6] == '"'); - snprintf(stack[sptr].param[PARAM_COLOR], - sizeof(stack[0].param[PARAM_COLOR]), - "{\\c&H%X&}", - html_color_parse(log_ctx, param)); + color = html_color_parse(log_ctx, param); + if (color >= 0) { + new_tag->color = 0xff000000 | color; + av_bprintf(dst, "{\\c&H%X&}", new_tag->color & 0xffffff); + } } else if (!av_strncasecmp(param, "face=", 5)) { param += 5 + (param[5] == '"'); len = strcspn(param, param[-1] == '"' ? "\"" :" "); - av_strlcpy(tmp, param, - FFMIN(sizeof(tmp), len+1)); + av_strlcpy(new_tag->face, param, + FFMIN(sizeof(new_tag->face), len+1)); param += len; - snprintf(stack[sptr].param[PARAM_FACE], - sizeof(stack[0].param[PARAM_FACE]), - "{\\fn%s}", tmp); + av_bprintf(dst, "{\\fn%s}", new_tag->face); } if ((param = strchr(param, ' '))) param++; } - for (i=0; i", tagname); - } - if (tag_close) { - sptr--; - } else if (unknown && !strstr(in, tmp)) { - in -= len + tag_close; - av_bprint_chars(dst, *in, 1); - } else - av_strlcpy(stack[sptr++].tag, tagname, - sizeof(stack[0].tag)); - break; + av_bprint_chars(dst, '<', 1); } + } else { + av_bprint_chars(dst, *in, 1); } + break; default: av_bprint_chars(dst, *in, 1); break; diff --git a/tests/ref/fate/sub-sami2 b/tests/ref/fate/sub-sami2 index 9e9c80870d..64656f06cb 100644 --- a/tests/ref/fate/sub-sami2 +++ b/tests/ref/fate/sub-sami2 @@ -19,7 +19,7 @@ Dialogue: 0,0:00:17.67,0:00:21.72,Default,,0,0,0,,\N{\c&HCBC0FF&}{\fs6}It's{\fs} Dialogue: 0,0:00:21.84,0:00:21.84,Default,,0,0,0,,\N{\u1}매력적인 세계에서\N이 모든 것이 펼쳐집니다{\u1} Dialogue: 0,0:00:21.84,0:00:23.58,Default,,0,0,0,,\N{\i1}{\c&H9966CC&}of{\c}{\c&HC2A3E0&} what's{\c} {\c&HE0D1F0&}right{\c} {\c&HFCFAFE&}and{\c} wrong.{\i0} Dialogue: 0,0:00:23.69,0:00:23.69,Default,,0,0,0,,\N{\i1}이 주제를 심오한 철학으로\N담아내고 있어요{\i0} -Dialogue: 0,0:00:23.69,0:00:25.67,Default,,0,0,0,,\N{\fs20}{\c&HFF0000&}{\s1}It{\s0}{\c}{\fs} has {\fs15}{\c&HFFFF00&}a{\c}{\fs} great {\fs16}{\c&HFFCC00&}philosophy{\c}{\fs} about it. +Dialogue: 0,0:00:23.69,0:00:25.67,Default,,0,0,0,,\N{\c&HFF0000&}{\fs20}{\s1}It{\s0}{\fs}{\c} has {\c&HFFFF00&}{\fs15}a{\fs}{\c} great {\c&HFFCC00&}{\fs16}philosophy{\fs}{\c} about it. Dialogue: 0,0:00:40.22,0:00:40.22,Default,,0,0,0,,\N{\s1}"왕좌의 게임"은 웨스테로스라는 가상왕국의\N권력 분쟁 이야기입니다{\s0} Dialogue: 0,0:00:40.22,0:00:47.94,Default,,0,0,0,,\N{\c&HA5FF&}{\fs26}"Game of Thrones"{\fs}{\c} {\c&H2A2AA5&}{\b1}is{\b0}{\c}{\c&HFFFF&}{\fs24}{\i1} about{\i0}{\fs}{\c} {\c&H336699&}{\fs14}power{\fs}{\c}{\c&HFF&} struggles{\c}\N{\c&HA5FF&}{\fs8}in a fantasy{\fs}{\c&HCBC0FF&} kingdom{\c&HA5FF&}, called {\fs6}Westeros.{\fs}{\c} Dialogue: 0,0:00:48.06,0:00:48.06,Default,,0,0,0,,\N철의 왕좌를 둘러싼\N권력 분쟁이죠 @@ -51,7 +51,7 @@ Dialogue: 0,0:01:28.98,0:01:33.89,Default,,0,0,0,,\N{\u1}this common threat, tha Dialogue: 0,0:01:34.01,0:01:35.24,Default,,0,0,0,,\Npursuing their own interests. Dialogue: 0,0:01:35.36,0:01:35.36,Default,,0,0,0,,\N한편, 일곱 왕국의 밖에서는\N두 개의 거대한 위협이 부상합니다 Dialogue: 0,0:01:35.36,0:01:40.23,Default,,0,0,0,,\N{\fs30}And meanwhile, outside the Seven\NKingdoms, two great threats arising.{\fs} -Dialogue: 0,0:01:40.35,0:01:40.35,Default,,0,0,0,,\N하나는 바다 건너\N타가리엔 일족 유배자들이며 +Dialogue: 0,0:01:40.35,0:01:40.35,Default,,0,0,0,,\N하나는 바다 건너\N타가리엔 일족 유배자들이며 Dialogue: 0,0:01:40.35,0:01:44.06,Default,,0,0,0,,\NOne across the sea, in the exile\NTargaryen siblings, Dialogue: 0,0:01:44.17,0:01:44.17,Default,,0,0,0,,\N또 하나는 일곱 왕국의\N국경이 자리잡은 Dialogue: 0,0:01:44.17,0:01:47.39,Default,,0,0,0,,\Nand another far to the north,\Nbeyond the Wall, diff --git a/tests/ref/fate/sub-srt b/tests/ref/fate/sub-srt index 40b20cde90..fd60290929 100644 --- a/tests/ref/fate/sub-srt +++ b/tests/ref/fate/sub-srt @@ -14,10 +14,10 @@ Dialogue: 0,0:00:00.00,0:00:00.00,Default,,0,0,0,,Don't show this text it may be Dialogue: 0,0:00:01.50,0:00:04.50,Default,,0,0,0,,SubRip subtitles capability tester 1.3o by ale5000\N{\b1}{\i1}Use VLC 1.1 or higher as reference for most things and MPC Home Cinema for others{\i0}{\b0}\N{\c&HFF0000&}This text should be blue{\c}\N{\c&HFF&}This text should be red{\c}\N{\c&H0&}This text should be black{\c}\N{\fnWebdings}If you see this with the normal font, the player don't (fully) support font face{\fn} Dialogue: 0,0:00:04.50,0:00:04.50,Default,,0,0,0,,Hidden Dialogue: 0,0:00:04.50,0:00:07.50,Default,,0,0,0,,{\fs8}This text should be small{\fs}\NThis text should be normal\N{\fs35}This text should be big{\fs} -Dialogue: 0,0:00:07.50,0:00:11.50,Default,,0,0,0,,This should be an E with an accent: È\N日本語\N{\fs30}{\b1}{\i1}{\u1}This text should be bold, italics and underline{\u0}{\i0}{\b0}{\fs}\N{\fs9}{\c&HFF00&}This text should be small and green{\c}{\fs}\N{\fs9}{\c&HFF&}This text should be small and red{\c}{\fs}\N{\fs24}{\c&H2A2AA5&}This text should be big and brown{\c}{\fs} +Dialogue: 0,0:00:07.50,0:00:11.50,Default,,0,0,0,,This should be an E with an accent: È\N日本語\N{\fs30}{\b1}{\i1}{\u1}This text should be bold, italics and underline{\u0}{\i0}{\b0}{\fs}\N{\fs9}{\c&HFF00&}This text should be small and green{\fs}{\c}\N{\c&HFF&}{\fs9}This text should be small and red{\fs}{\c}\N{\c&H2A2AA5&}{\fs24}This text should be big and brown{\fs}{\c} Dialogue: 0,0:00:11.50,0:00:14.50,Default,,0,0,0,,{\b1}This line should be bold{\b0}\N{\i1}This line should be italics{\i0}\N{\u1}This line should be underline{\u0}\N{\s1}This line should be strikethrough{\s0}\N{\u1}Both lines\Nshould be underline{\u0} -Dialogue: 0,0:00:14.50,0:00:17.50,Default,,0,0,0,,>\NIt would be a good thing to\Nhide invalid html tags that are closed and show the text in them\Nbut show un-closed invalid html tags\NShow not opened tags\N< -Dialogue: 0,0:00:17.50,0:00:20.50,Default,,0,0,0,,and also\Nhide invalid html tags with parameters that are closed and show the text in them\Nbut show un-closed invalid html tags\N{\u1}This text should be showed underlined without problems also: 2<3,5>1,4<6{\u0}\NThis shouldn't be underlined +Dialogue: 0,0:00:14.50,0:00:17.50,Default,,0,0,0,,>\NIt would be a good thing to\Nhide invalid html tags that are closed and show the text in them\Nbut show un-closed invalid html tags\NShow not opened tags\N< +Dialogue: 0,0:00:17.50,0:00:20.50,Default,,0,0,0,,and also\Nhide invalid html tags with parameters that are closed and show the text in them\Nbut show un-closed invalid html tags\N{\u1}This text should be showed underlined without problems also: 2<3,5>1,4<6{\u0}\NThis shouldn't be underlined Dialogue: 0,0:00:20.50,0:00:21.50,Default,,0,0,0,,This text should be in the normal position... Dialogue: 0,0:00:21.50,0:00:22.50,Default,,0,0,0,,{\an5}{\pos(0,45)}This text should NOT be in the normal position Dialogue: 0,0:00:22.50,0:00:24.50,Default,,0,0,0,,Implementation is the same of the ASS tag\N{\an8}This text should be at the\Ntop and horizontally centered @@ -29,7 +29,7 @@ Dialogue: 0,0:00:24.50,0:00:26.50,Default,,0,0,0,,{\an1}This text should be at t Dialogue: 0,0:00:26.50,0:00:28.50,Default,,0,0,0,,{\an9}This text should be at the\Ntop and horizontally at the right Dialogue: 0,0:00:26.50,0:00:28.50,Default,,0,0,0,,{\an6}This text should be at the\Nmiddle and horizontally at the right Dialogue: 0,0:00:26.50,0:00:28.50,Default,,0,0,0,,{\an3}This text should be at the\Nbottom and horizontally at the right -Dialogue: 0,0:00:28.50,0:00:31.50,Default,,0,0,0,,{\fs6}{\c&HFF00&}This could be the {\fs35}m{\c&H0&}o{\c&HFF00&}st{\fs6} difficult thing to implement{\c}{\fs} +Dialogue: 0,0:00:28.50,0:00:31.50,Default,,0,0,0,,{\c&HFF00&}{\fs6}This could be the {\fs35}m{\c&H0&}o{\c&HFF00&}st{\fs6} difficult thing to implement{\fs}{\c} Dialogue: 0,0:00:31.50,0:00:50.50,Default,,0,0,0,,First text Dialogue: 0,0:00:33.50,0:00:35.50,Default,,0,0,0,,Second, it shouldn't overlap first Dialogue: 0,0:00:35.50,0:00:37.50,Default,,0,0,0,,Third, it should replace second @@ -44,6 +44,6 @@ Dialogue: 0,0:00:54.50,0:00:56.50,Default,,0,0,0,,{\an1}\N\h\h\h\h\hA (05 hard s Dialogue: 0,0:00:56.50,0:00:58.50,Default,,0,0,0,,\h\h\h\h\hA (05 hard spaces followed by a letter)\NA (Normal spaces followed by a letter)\NA (No hard spaces followed by a letter)\NShow this: \TEST and this: \-) Dialogue: 0,0:00:58.50,0:01:00.50,Default,,0,0,0,,{\an3}\NA letter followed by 05 hard spaces: A\h\h\h\h\h\NA letter followed by normal spaces: A\NA letter followed by no hard spaces: A\N05 hard spaces between letters: A\h\h\h\h\hA\N5 normal spaces between letters: A A\N\N^--Forced line break Dialogue: 0,0:01:00.50,0:01:02.50,Default,,0,0,0,,{\s1}Both line should be strikethrough,\Nyes.{\s0}\NCorrectly closed tags\Nshould be hidden. -Dialogue: 0,0:01:02.50,0:01:04.50,Default,,0,0,0,,It shouldn't be strikethrough,\Nnot opened tag showed as text.\NNot opened tag showed as text. -Dialogue: 0,0:01:04.50,0:01:06.50,Default,,0,0,0,,{\s1}Three lines should be strikethrough,\Nyes.\NNot closed tags showed as text -Dialogue: 0,0:01:06.50,0:01:08.50,Default,,0,0,0,,{\s1}Both line should be strikethrough but\Nthe wrong closing tag should be showed +Dialogue: 0,0:01:02.50,0:01:04.50,Default,,0,0,0,,It shouldn't be strikethrough,\Nnot opened tag showed as text.{\s0}\NNot opened tag showed as text. +Dialogue: 0,0:01:04.50,0:01:06.50,Default,,0,0,0,,{\s1}Three lines should be strikethrough,\Nyes.\NNot closed tags showed as text +Dialogue: 0,0:01:06.50,0:01:08.50,Default,,0,0,0,,{\s1}Both line should be strikethrough but\Nthe wrong closing tag should be showed{\b0} diff --git a/tests/ref/fate/sub-srt-badsyntax b/tests/ref/fate/sub-srt-badsyntax index 1ee2f95dcc..791e00e691 100644 --- a/tests/ref/fate/sub-srt-badsyntax +++ b/tests/ref/fate/sub-srt-badsyntax @@ -14,8 +14,8 @@ Dialogue: 0,0:00:01.00,0:00:02.31,Default,,0,0,0,,<<<<<< Antes <<<<<<\NEla é hu Dialogue: 0,0:01:10.00,0:01:14.50,Default,,0,0,0,,>>> RebelSubTeam <<< Dialogue: 0,0:02:37.75,0:02:43.70,Default,,0,0,0,,{\b1}~ASUKO MARCH!~\N>>:<<\Ntranslation by: cangii\NRetiming by: furransu{\b0} Dialogue: 0,0:03:38.32,0:03:42.78,Default,,0,0,0,,<>\N<> -Dialogue: 0,0:04:50.43,0:05:01.03,Default,,0,0,0,,<pisode 4\Nsaison 1>\Nwww.SeriesSub.com> -Dialogue: 0,0:20:31.85,0:20:56.84,Default,,0,0,0,,{\c&HFFFFFFFF&}\N<<<>>>{\c} +Dialogue: 0,0:04:50.43,0:05:01.03,Default,,0,0,0,,<\Nwww.SeriesSub.com> +Dialogue: 0,0:20:31.85,0:20:56.84,Default,,0,0,0,,\N<<<>>> Dialogue: 0,0:37:59.69,0:38:01.59,Default,,0,0,0,,mint asztalt foglaltatni\Na <<>Le Cirque-ben. Dialogue: 0,0:53:43.78,0:53:45.94,Default,,0,0,0,,<> - he calmed himself. Dialogue: 0,0:53:46.22,0:53:49.09,Default,,0,0,0,,< It would be a good thing to hide invalid html tags that are closed and show the text in them -but show un-closed invalid html tags -Show not opened tags +but show un-closed invalid html tags +Show not opened tags < 8 00:00:17,501 --> 00:00:20,500 and also hide invalid html tags with parameters that are closed and show the text in them -but show un-closed invalid html tags +but show un-closed invalid html tags This text should be showed underlined without problems also: 2<3,5>1,4<6 This shouldn't be underlined @@ -197,17 +197,17 @@ should be hidden. 35 00:01:02,501 --> 00:01:04,500 It shouldn't be strikethrough, -not opened tag showed as text. -Not opened tag showed as text. +not opened tag showed as text. +Not opened tag showed as text. 36 00:01:04,501 --> 00:01:06,500 Three lines should be strikethrough, yes. -Not closed tags showed as text +Not closed tags showed as text 37 00:01:06,501 --> 00:01:08,500 Both line should be strikethrough but -the wrong closing tag should be showed +the wrong closing tag should be showed diff --git a/tests/ref/fate/sub-webvttenc b/tests/ref/fate/sub-webvttenc index ba567c33f6..45ae0b6131 100644 --- a/tests/ref/fate/sub-webvttenc +++ b/tests/ref/fate/sub-webvttenc @@ -39,14 +39,14 @@ should be underline > It would be a good thing to hide invalid html tags that are closed and show the text in them -but show un-closed invalid html tags -Show not opened tags +but show un-closed invalid html tags +Show not opened tags < 00:17.501 --> 00:20.500 and also hide invalid html tags with parameters that are closed and show the text in them -but show un-closed invalid html tags +but show un-closed invalid html tags This text should be showed underlined without problems also: 2<3,5>1,4<6 This shouldn't be underlined @@ -164,14 +164,14 @@ should be hidden. 01:02.501 --> 01:04.500 It shouldn't be strikethrough, -not opened tag showed as text. -Not opened tag showed as text. +not opened tag showed as text. +Not opened tag showed as text. 01:04.501 --> 01:06.500 Three lines should be strikethrough, yes. -Not closed tags showed as text +Not closed tags showed as text 01:06.501 --> 01:08.500 Both line should be strikethrough but -the wrong closing tag should be showed +the wrong closing tag should be showed