You've already forked FFmpeg
							
							
				mirror of
				https://github.com/FFmpeg/FFmpeg.git
				synced 2025-10-30 23:18:11 +02:00 
			
		
		
		
	lavu/avstring: add av_utf8_decode() function
This commit is contained in:
		| @@ -15,6 +15,9 @@ libavutil:     2012-10-22 | ||||
|  | ||||
| API changes, most recent first: | ||||
|  | ||||
| 2013-11-XX - xxxxxxx - lavu 52.54.100 - avstring.h | ||||
|   Add av_utf8_decode() function. | ||||
|  | ||||
| 2013-11-xx - xxxxxxx - lavc 55.44.100 - avcodec.h | ||||
|   Add av_packet_{un,}pack_dictionary() | ||||
|   Add AV_PKT_METADATA_UPDATE side data type, used to transmit key/value | ||||
|   | ||||
| @@ -157,6 +157,7 @@ TESTPROGS = adler32                                                     \ | ||||
|             sha                                                         \ | ||||
|             sha512                                                      \ | ||||
|             tree                                                        \ | ||||
|             utf8                                                        \ | ||||
|             xtea                                                        \ | ||||
|  | ||||
| TESTPROGS-$(HAVE_LZO1X_999_COMPRESS) += lzo | ||||
|   | ||||
| @@ -307,6 +307,70 @@ int av_isxdigit(int c) | ||||
|     return av_isdigit(c) || (c >= 'a' && c <= 'f'); | ||||
| } | ||||
|  | ||||
| int av_utf8_decode(int32_t *codep, const uint8_t **bufp, const uint8_t *buf_end, | ||||
|                    unsigned int flags) | ||||
| { | ||||
|     const uint8_t *p = *bufp; | ||||
|     uint32_t top; | ||||
|     uint64_t code; | ||||
|     int ret = 0; | ||||
|  | ||||
|     if (p >= buf_end) | ||||
|         return 0; | ||||
|  | ||||
|     code = *p++; | ||||
|  | ||||
|     /* first sequence byte starts with 10, or is 1111-1110 or 1111-1111, | ||||
|        which is not admitted */ | ||||
|     if ((code & 0xc0) == 0x80 || code >= 0xFE) { | ||||
|         ret = AVERROR(EILSEQ); | ||||
|         goto end; | ||||
|     } | ||||
|     top = (code & 128) >> 1; | ||||
|  | ||||
|     while (code & top) { | ||||
|         int tmp; | ||||
|         if (p >= buf_end) { | ||||
|             ret = AVERROR(EILSEQ); /* incomplete sequence */ | ||||
|             goto end; | ||||
|         } | ||||
|  | ||||
|         /* we assume the byte to be in the form 10xx-xxxx */ | ||||
|         tmp = *p++ - 128;   /* strip leading 1 */ | ||||
|         if (tmp>>6) { | ||||
|             ret = AVERROR(EILSEQ); | ||||
|             goto end; | ||||
|         } | ||||
|         code = (code<<6) + tmp; | ||||
|         top <<= 5; | ||||
|     } | ||||
|     code &= (top << 1) - 1; | ||||
|  | ||||
|     if (code >= 1<<31) { | ||||
|         ret = AVERROR(EILSEQ);  /* out-of-range value */ | ||||
|         goto end; | ||||
|     } | ||||
|  | ||||
|     *codep = code; | ||||
|  | ||||
|     if (code > 0x10FFFF && | ||||
|         !(flags & AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES)) | ||||
|         ret = AVERROR(EILSEQ); | ||||
|     if (code < 0x20 && code != 0x9 && code != 0xA && code != 0xD && | ||||
|         flags & AV_UTF8_FLAG_EXCLUDE_XML_INVALID_CONTROL_CODES) | ||||
|         ret = AVERROR(EILSEQ); | ||||
|     if (code >= 0xD800 && code <= 0xDFFF && | ||||
|         !(flags & AV_UTF8_FLAG_ACCEPT_SURROGATES)) | ||||
|         ret = AVERROR(EILSEQ); | ||||
|     if (code == 0xFFFE || code == 0xFFFF && | ||||
|         (!flags & AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS)) | ||||
|         ret = AVERROR(EILSEQ); | ||||
|  | ||||
| end: | ||||
|     *bufp = p; | ||||
|     return ret; | ||||
| } | ||||
|  | ||||
| #ifdef TEST | ||||
|  | ||||
| int main(void) | ||||
|   | ||||
| @@ -22,6 +22,7 @@ | ||||
| #define AVUTIL_AVSTRING_H | ||||
|  | ||||
| #include <stddef.h> | ||||
| #include <stdint.h> | ||||
| #include "attributes.h" | ||||
|  | ||||
| /** | ||||
| @@ -295,6 +296,45 @@ enum AVEscapeMode { | ||||
| int av_escape(char **dst, const char *src, const char *special_chars, | ||||
|               enum AVEscapeMode mode, int flags); | ||||
|  | ||||
/* Flags for av_utf8_decode(); each is a distinct bit and they may be
   combined with bitwise OR. */
#define AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES          1 ///< accept codepoints over 0x10FFFF
#define AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS             2 ///< accept non-characters - 0xFFFE and 0xFFFF
#define AV_UTF8_FLAG_ACCEPT_SURROGATES                 4 ///< accept UTF-16 surrogate codes
#define AV_UTF8_FLAG_EXCLUDE_XML_INVALID_CONTROL_CODES 8 ///< exclude control codes not accepted by XML

/* Union of the three ACCEPT flags. The replacement list is parenthesized so
   the macro behaves as a single value inside larger expressions such as
   (flags & AV_UTF8_FLAG_ACCEPT_ALL). */
#define AV_UTF8_FLAG_ACCEPT_ALL \
    (AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES | AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS | AV_UTF8_FLAG_ACCEPT_SURROGATES)
|  | ||||
| /** | ||||
|  * Read and decode a single UTF-8 code point (character) from the | ||||
|  * buffer in *bufp, and update *bufp to point to the next byte to | ||||
|  * decode. | ||||
|  * | ||||
|  * In case of an invalid byte sequence, the pointer will be updated to | ||||
|  * the next byte after the invalid sequence and the function will | ||||
|  * return an error code. | ||||
|  * | ||||
|  * Depending on the specified flags, the function will also fail in | ||||
|  * case the decoded code point does not belong to a valid range. | ||||
|  * | ||||
|  * @note For speed-relevant code a carefully implemented use of | ||||
|  * GET_UTF8() may be preferred. | ||||
|  * | ||||
|  * @param codep   pointer used to return the parsed code in case of success. | ||||
|  *                The value in *codep is set even in case the range check fails. | ||||
|  * @param bufp    pointer to the address of the first byte of the sequence | ||||
|  *                to decode, updated by the function to point to the | ||||
|  *                byte immediately after the decoded sequence | ||||
|  * @param buf_end pointer to the end of the buffer, points to the next | ||||
|  *                byte past the last in the buffer. This is used to | ||||
|  *                avoid buffer overreads (in case of an unfinished | ||||
|  *                UTF-8 sequence towards the end of the buffer). | ||||
|  * @param flags   a collection of AV_UTF8_FLAG_* flags | ||||
|  * @return >= 0 in case a sequence was successfully read, a negative | ||||
|  * value in case of invalid sequence | ||||
|  */ | ||||
| int av_utf8_decode(int32_t *codep, const uint8_t **bufp, const uint8_t *buf_end, | ||||
|                    unsigned int flags); | ||||
|  | ||||
| /** | ||||
|  * @} | ||||
|  */ | ||||
|   | ||||
							
								
								
									
										71
									
								
								libavutil/utf8.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										71
									
								
								libavutil/utf8.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,71 @@ | ||||
| /* | ||||
|  * Copyright (c) 2013 Stefano Sabatini | ||||
|  * | ||||
|  * This file is part of FFmpeg. | ||||
|  * | ||||
|  * FFmpeg is free software; you can redistribute it and/or | ||||
|  * modify it under the terms of the GNU Lesser General Public | ||||
|  * License as published by the Free Software Foundation; either | ||||
|  * version 2.1 of the License, or (at your option) any later version. | ||||
|  * | ||||
|  * FFmpeg is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|  * Lesser General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU Lesser General Public | ||||
|  * License along with FFmpeg; if not, write to the Free Software | ||||
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||||
|  */ | ||||
|  | ||||
| #include <stdio.h> | ||||
|  | ||||
| #include "libavutil/avstring.h" | ||||
| #include "libavutil/file.h" | ||||
|  | ||||
/**
 * Print the first l bytes at p as two-digit uppercase hex values, then emit
 * an empty string with field width (indent - 2*l) so the hex dump occupies a
 * column of indent characters.
 */
static void print_sequence(const char *p, int l, int indent)
{
    int pad = indent - l*2;
    int n = 0;

    while (n < l) {
        printf("%02X", (uint8_t)p[n]);
        n++;
    }
    printf("%*s", pad, "");
}
|  | ||||
/**
 * Map the file named by argv[1] and decode it one UTF-8 sequence at a time.
 *
 * For every sequence a line is printed: the consumed bytes in hex, then
 * either the code point (decimal, hex, byte length) followed by the sequence
 * itself or "invalid code range", or "invalid sequence" when the decoder
 * did not produce a code point at all.
 *
 * @return 0 on success, 1 if no file name was given or the file could not
 *         be mapped
 */
int main(int argc, char **argv)
{
    int ret;
    char *filename;
    uint8_t *file_buf;
    size_t file_buf_size;
    /* av_utf8_decode() stores through an int32_t *, so the variable must be
       int32_t; -1 marks "decoder wrote nothing" since decoded values are
       never negative */
    const int32_t unset = -1;
    int32_t code;
    const uint8_t *p, *endp;

    if (argc < 2) {
        fprintf(stderr, "usage: utf8 <input_file>\n");
        return 1;
    }
    filename = argv[1];

    ret = av_file_map(filename, &file_buf, &file_buf_size, 0, NULL);
    if (ret < 0)
        return 1;

    p = file_buf;
    endp = file_buf + file_buf_size;
    while (p < endp) {
        int l, r;
        const uint8_t *p0 = p;
        code = unset;
        r = av_utf8_decode(&code, &p, endp, 0);
        l = (int)(p-p0);   /* number of bytes the decoder consumed */
        print_sequence((const char *)p0, l, 20);
        if (code != unset) {
            printf("%-10d 0x%-10X %-5d ", (int)code, (unsigned int)code, l);
            if (r >= 0) {
                if (*p0 == '\n') printf("\\n\n");
                else             printf ("%.*s\n", l, (const char *)p0);
            } else {
                printf("invalid code range\n");
            }
        } else {
            printf("invalid sequence\n");
        }
    }

    av_file_unmap(file_buf, file_buf_size);
    return 0;
}
| @@ -75,7 +75,7 @@ | ||||
|  */ | ||||
|  | ||||
| #define LIBAVUTIL_VERSION_MAJOR  52 | ||||
| #define LIBAVUTIL_VERSION_MINOR  53 | ||||
| #define LIBAVUTIL_VERSION_MINOR  54 | ||||
| #define LIBAVUTIL_VERSION_MICRO 100 | ||||
|  | ||||
| #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user