kolmck/WideStringEdit.pas

509 lines
12 KiB
ObjectPascal

unit WideStringEdit;
interface
function WStrLComp(const Str1, Str2: PWideChar; MaxLen: Cardinal): Integer;
{* Compare two strings (fast). At most MaxLen characters (16-bit words, not
bytes) are compared; the comparison also stops after the terminating 0 of
Str2 if it is found within MaxLen characters.
Pass the minimum of the two lengths as MaxLen. }
function WS2Int( S: PWideChar ): Integer;
{* Converts null-terminated string to Integer. Scanning stopped when any
non-digit character found. Even empty string or string not containing
valid integer number silently converted to 0. }
function UTF8ToUCS2(Dest: PWideChar; MaxDestBytes: Cardinal;
Source: PChar; SourceChars: Cardinal): Cardinal;
(* Decode string from UTF8 to UCS2 *)
function UCS2ToUTF8(Dest: PChar; MaxDestBytes: Cardinal;
Source: PWideChar; SourceChars: Cardinal): Cardinal;
(* Encode string from UCS2 to UTF8 *)
implementation
function WStrLComp(const Str1, Str2: PWideChar; MaxLen: Cardinal): Integer; assembler;
// Compares at most MaxLen wide characters of Str1 and Str2.
// In:     EAX = Str1, EDX = Str2, ECX = MaxLen (in 16-bit characters)
// Out:    EAX = 0 if no difference was found, otherwise
//               (Str1 character) - (Str2 character) at the first difference
// Saved:  EDI, ESI, EBX (pushed/popped here); EDX and ECX are clobbered.
// Notes:  Str1/Str2 are dereferenced without a nil check.
//         The string instructions rely on the direction flag being clear
//         on entry (not set here).
asm
        PUSH    EDI
        PUSH    ESI
        PUSH    EBX
        MOV     EDI,EDX                 // EDI = Str2 (scanned by SCASW)
        MOV     ESI,EAX                 // ESI = Str1
        MOV     EBX,ECX                 // EBX = MaxLen
        XOR     EAX,EAX                 // Result := 0; AX = 0 is also the SCASW search value
        OR      ECX,ECX
        JE      @@1                     // MaxLen = 0: nothing to compare, return 0
        REPNE   SCASW                   // look for Str2's terminator within MaxLen words
        SUB     EBX,ECX                 // EBX = words scanned (terminator included if found)
        MOV     ECX,EBX                 // compare exactly that many words
        MOV     EDI,EDX                 // rewind EDI to the start of Str2
        XOR     EDX,EDX                 // clear upper half so DX load is zero-extended
        REPE    CMPSW                   // stop at first differing word or when ECX = 0
        MOV     AX,[ESI-2]              // last compared character of Str1
        MOV     DX,[EDI-2]              // last compared character of Str2
        SUB     EAX,EDX                 // 0 when equal, else signed difference
@@1:
        POP     EBX
        POP     ESI
        POP     EDI
end;
function WS2Int( S: PWideChar ): Integer;
// Parses an optionally signed decimal number from a null-terminated
// wide string.
// In:   EAX = S (may be nil)
// Out:  EAX = parsed value; 0 when S is nil or no digit follows the
//             optional sign. Parsing stops at the first non-digit.
//             No overflow check is made - the value wraps modulo 2^32.
// Uses: EDX (read pointer), ECX (current character / digit).
asm
        MOV     EDX, EAX                // EDX = read pointer
        XOR     EAX, EAX                // accumulator := 0
        TEST    EDX, EDX
        JZ      @@done                  // nil string -> 0
        MOVZX   ECX, WORD PTR [EDX]     // first character, zero-extended
        ADD     EDX, 2
        PUSH    ECX                     // remember it: decides the sign at the end
        CMP     ECX, '-'
        JE      @@next                  // skip a leading '-'
        CMP     ECX, '+'
        JNE     @@digit                 // no sign: first character may be a digit
@@next:
        MOVZX   ECX, WORD PTR [EDX]     // fetch next character
        ADD     EDX, 2
@@digit:
        SUB     ECX, '0'                // ECX = digit value, or out of range
        CMP     ECX, '9'-'0'
        JA      @@sign                  // unsigned compare also rejects chars below '0'
        LEA     EAX, [EAX+EAX*4]        // EAX := EAX * 5
        LEA     EAX, [ECX+EAX*2]        // EAX := EAX * 2 + digit  (i.e. *10 + digit)
        JMP     @@next
@@sign:
        POP     ECX                     // first character of the string
        CMP     ECX, '-'
        JNE     @@done
        NEG     EAX                     // leading '-' -> negate
@@done:
end;
function UTF8ToUCS2(Dest: PWideChar; MaxDestBytes: Cardinal;
Source: PChar; SourceChars: Cardinal): Cardinal;
// Decodes UTF-8 bytes from Source into 16-bit UCS-2 characters in Dest and
// appends a terminating 0 character.
// In:    EAX     = Dest
//        EDX     = MaxDestBytes - size of Dest in bytes, terminator included
//        ECX     = Source
//        (stack) = SourceChars - number of source bytes (PChar elements)
//                  that may be read
// Out:   EAX     = number of characters stored in Dest, terminator excluded
// Saved: EBX, ESI, EDI and the flags register are pushed/popped here.
//
// Decoding stops (the result is terminated and its length returned) when:
//   - Dest has no room for another character plus the terminator,
//   - a 0 byte is read from Source,
//   - SourceChars bytes have been consumed (a sequence cut by that limit
//     is dropped),
//   - a malformed or unsupported sequence is met: a stray 10xxxxxx byte,
//     a lead byte >= $F0 (does not fit 16 bits), or a continuation byte
//     that is not 10xxxxxx.
// Overlong encodings and surrogate values are not checked.
// If Dest is nil or MaxDestBytes < 2 nothing is written and 0 is returned.
// If Source is nil or SourceChars = 0 only the terminator is written.
asm
        PUSHF                           // keep the caller's direction flag
        CLD                             // LODSB/STOSW must move forward
        PUSH    EBX
        PUSH    ESI
        PUSH    EDI
        PUSH    Dest                    // start of Dest, popped at @Done for the length
        MOV     EDI, Dest
        TEST    EDI, EDI
        JZ      @Done                   // no buffer: return 0, write nothing
        CMP     EDX, 2
        JB      @Done                   // no room even for the terminator
        TEST    Source, Source
        JZ      @Exit                   // nil source: empty result
        MOV     ESI, Source
        MOV     ECX, SourceChars        // ECX = source bytes still available
        TEST    ECX, ECX
        JZ      @Exit                   // zero bytes requested: empty result
@NextChar:
        // need 2 bytes for this character + 2 for the terminator (unsigned)
        CMP     EDX, 4
        JB      @Exit
        SUB     EDX, 2
        XOR     EAX, EAX                // AH = 0 for the single-byte case
        LODSB                           // AL = lead byte
        TEST    AL, AL
        JZ      @Exit                   // end of string
        TEST    AL, $80
        JNZ     @1xxxxxxx
        // 0xxxxxxx: AX already holds the character
@SaveU16:
        STOSW
        DEC     ECX                     // last byte of this character consumed
        JNZ     @NextChar
        JMP     @Exit
@1xxxxxxx:
        CMP     AL, $C0
        JB      @Exit                   // 10xxxxxx cannot start a sequence
        CMP     AL, $F0
        JAE     @Exit                   // 4-byte sequence: not representable in UCS-2
        DEC     ECX                     // lead byte consumed
        JZ      @Exit                   // second byte lies outside SourceChars
        MOV     AH, AL                  // AH = lead byte
        LODSB                           // AL = second byte
        MOV     BL, AL
        AND     BL, $C0
        CMP     BL, $80
        JNE     @Exit                   // second byte must be 10xxxxxx
        TEST    AH, $20
        JNZ     @1110xxxx               // 1110xxxx: three-byte sequence
        // 110abcde 10fghijk -> 00000abc:defghijk
        MOV     BL, AH                  // BL = 110abcde
        SHR     AH, 2                   // AH = 00110abc
        AND     AX, $073F               // AH = 00000abc, AL = 00fghijk
        SHL     BL, 6                   // BL = de000000
        OR      AL, BL                  // AL = defghijk
        JMP     @SaveU16
@1110xxxx:
        DEC     ECX                     // second byte consumed
        JZ      @Exit                   // third byte lies outside SourceChars
        MOV     BL, AL                  // BL = second byte 10efghij
        LODSB                           // AL = third byte
        MOV     BH, AL
        AND     BH, $C0
        CMP     BH, $80
        JNE     @Exit                   // third byte must be 10xxxxxx
        // 1110abcd 10efghij 10klmnop -> abcdefgh:ijklmnop
        AND     BX, $003F               // BH = 0, BL = 00efghij
        ROR     BX, 2                   // BH = ij000000, BL = 0000efgh
        AND     AL, $3F                 // AL = 00klmnop
        OR      AL, BH                  // AL = ijklmnop
        SHL     AH, 4                   // AH = abcd0000
        OR      AH, BL                  // AH = abcdefgh
        JMP     @SaveU16
@Exit:
        XOR     EAX, EAX
        MOV     [EDI], AX               // terminating 0 character
@Done:
        POP     EAX                     // EAX = start of Dest
        XCHG    EAX, EDI                // EAX = end, EDI = start
        SUB     EAX, EDI                // bytes written
        SHR     EAX, 1                  // -> characters
        POP     EDI
        POP     ESI
        POP     EBX
        POPF
end;
function UCS2ToUTF8(Dest: PChar; MaxDestBytes: Cardinal;
Source: PWideChar; SourceChars: Cardinal): Cardinal;
// Encodes 16-bit UCS-2 characters from Source as UTF-8 bytes in Dest and
// appends a terminating 0 byte.
// In:    EAX     = Dest
//        EDX     = MaxDestBytes - size of Dest in bytes, terminator included
//        ECX     = Source
//        (stack) = SourceChars - number of 16-bit characters that may be read
// Out:   EAX     = number of bytes stored in Dest, terminator excluded
// Saved: EBX, ESI, EDI and the flags register are pushed/popped here.
//
// Encoding stops when Dest cannot hold the next complete sequence plus the
// terminator, when a 0 character is read, or after SourceChars characters.
// Every value above $07FF is written as a three-byte sequence; surrogate
// values are not treated specially.
// If Dest is nil or MaxDestBytes = 0 nothing is written and 0 is returned.
// If Source is nil or SourceChars = 0 only the terminator is written.
asm
        PUSHF                           // keep the caller's direction flag
        CLD                             // LODSW/STOSB must move forward
        PUSH    EBX
        PUSH    ESI
        PUSH    EDI
        PUSH    Dest                    // start of Dest, popped at @Done for the length
        MOV     EDI, Dest
        TEST    EDI, EDI
        JZ      @Done                   // no buffer: return 0, write nothing
        TEST    EDX, EDX
        JZ      @Done                   // no room even for the terminator
        TEST    Source, Source
        JZ      @Exit                   // nil source: empty result
        MOV     ESI, Source
        MOV     ECX, SourceChars        // ECX = characters still to convert
        TEST    ECX, ECX
        JZ      @Exit                   // zero characters requested: empty result
@NextChar:
        // each output byte needs EDX >= 2: itself plus the terminator (unsigned)
        CMP     EDX, 2
        JB      @Exit
        DEC     EDX
        XOR     EAX, EAX
        LODSW                           // AX = next UCS-2 character
        TEST    EAX, EAX
        JZ      @Exit                   // end of string
        CMP     AX, $007F
        JA      @11xxxxxx
        // $0000..$007F -> 0xxxxxxx, already in AL
@0xxxxxxx:
        STOSB
@Loop:
        DEC     ECX
        JNZ     @NextChar
        JMP     @Exit
@11xxxxxx:
        CMP     EDX, 2                  // room for a second byte?
        JB      @Exit
        DEC     EDX
        CMP     AX, $07FF
        JA      @1110xxxx
        // 00000abc:defghijk -> 110abcde 10fghijk
        ROR     AX, 6                   // AH = fghijk00, AL = 000abcde
        SHR     AH, 2                   // AH = 00fghijk
        OR      AX, $80C0               // AH = 10fghijk, AL = 110abcde
        STOSW                           // stores AL (lead) then AH
        JMP     @Loop
@1110xxxx:
        CMP     EDX, 2                  // room for a third byte?
        JB      @Exit
        DEC     EDX
        // abcdefgh:ijklmnop -> 1110abcd 10efghij 10klmnop
        MOV     BL, AL                  // keep the low byte for the third output byte
        ROL     AX, 4                   // AH = efghijkl, AL = mnopabcd
        AND     AL, $0F                 // AL = 0000abcd
        SHR     AH, 2                   // AH = 00efghij
        OR      AX, $80E0               // AH = 10efghij, AL = 1110abcd
        STOSW                           // first two bytes
        XCHG    EAX, EBX                // AL = ijklmnop
        AND     AL, $3F                 // AL = 00klmnop
        OR      AL, $80                 // AL = 10klmnop
        JMP     @0xxxxxxx               // store it and count the character
@Exit:
        MOV     BYTE PTR [EDI], $00     // terminating 0 byte
@Done:
        POP     EAX                     // EAX = start of Dest
        XCHG    EAX, EDI                // EAX = end, EDI = start
        SUB     EAX, EDI                // bytes written
        POP     EDI
        POP     ESI
        POP     EBX
        POPF
end;
END//Decode string from UTF8 to UCS2
function UTF8ToUCS2(Dest: PWideChar; MaxDestBytes: Cardinal;
Source: PChar; SourceChars: Cardinal): Cardinal;
// Decodes UTF-8 bytes from Source into 16-bit UCS-2 characters in Dest and
// appends a terminating 0 character.
// In:    EAX     = Dest
//        EDX     = MaxDestBytes - size of Dest in bytes, terminator included
//        ECX     = Source
//        (stack) = SourceChars - number of source bytes (PChar elements)
//                  that may be read
// Out:   EAX     = number of characters stored in Dest, terminator excluded
// Saved: EBX, ESI, EDI and the flags register are pushed/popped here.
//
// Decoding stops (the result is terminated and its length returned) when:
//   - Dest has no room for another character plus the terminator,
//   - a 0 byte is read from Source,
//   - SourceChars bytes have been consumed (a sequence cut by that limit
//     is dropped),
//   - a malformed or unsupported sequence is met: a stray 10xxxxxx byte,
//     a lead byte >= $F0 (does not fit 16 bits), or a continuation byte
//     that is not 10xxxxxx.
// Overlong encodings and surrogate values are not checked.
// If Dest is nil or MaxDestBytes < 2 nothing is written and 0 is returned.
// If Source is nil or SourceChars = 0 only the terminator is written.
asm
        PUSHF                           // keep the caller's direction flag
        CLD                             // LODSB/STOSW must move forward
        PUSH    EBX
        PUSH    ESI
        PUSH    EDI
        PUSH    Dest                    // start of Dest, popped at @Done for the length
        MOV     EDI, Dest
        TEST    EDI, EDI
        JZ      @Done                   // no buffer: return 0, write nothing
        CMP     EDX, 2
        JB      @Done                   // no room even for the terminator
        TEST    Source, Source
        JZ      @Exit                   // nil source: empty result
        MOV     ESI, Source
        MOV     ECX, SourceChars        // ECX = source bytes still available
        TEST    ECX, ECX
        JZ      @Exit                   // zero bytes requested: empty result
@NextChar:
        // need 2 bytes for this character + 2 for the terminator (unsigned)
        CMP     EDX, 4
        JB      @Exit
        SUB     EDX, 2
        XOR     EAX, EAX                // AH = 0 for the single-byte case
        LODSB                           // AL = lead byte
        TEST    AL, AL
        JZ      @Exit                   // end of string
        TEST    AL, $80
        JNZ     @1xxxxxxx
        // 0xxxxxxx: AX already holds the character
@SaveU16:
        STOSW
        DEC     ECX                     // last byte of this character consumed
        JNZ     @NextChar
        JMP     @Exit
@1xxxxxxx:
        CMP     AL, $C0
        JB      @Exit                   // 10xxxxxx cannot start a sequence
        CMP     AL, $F0
        JAE     @Exit                   // 4-byte sequence: not representable in UCS-2
        DEC     ECX                     // lead byte consumed
        JZ      @Exit                   // second byte lies outside SourceChars
        MOV     AH, AL                  // AH = lead byte
        LODSB                           // AL = second byte
        MOV     BL, AL
        AND     BL, $C0
        CMP     BL, $80
        JNE     @Exit                   // second byte must be 10xxxxxx
        TEST    AH, $20
        JNZ     @1110xxxx               // 1110xxxx: three-byte sequence
        // 110abcde 10fghijk -> 00000abc:defghijk
        MOV     BL, AH                  // BL = 110abcde
        SHR     AH, 2                   // AH = 00110abc
        AND     AX, $073F               // AH = 00000abc, AL = 00fghijk
        SHL     BL, 6                   // BL = de000000
        OR      AL, BL                  // AL = defghijk
        JMP     @SaveU16
@1110xxxx:
        DEC     ECX                     // second byte consumed
        JZ      @Exit                   // third byte lies outside SourceChars
        MOV     BL, AL                  // BL = second byte 10efghij
        LODSB                           // AL = third byte
        MOV     BH, AL
        AND     BH, $C0
        CMP     BH, $80
        JNE     @Exit                   // third byte must be 10xxxxxx
        // 1110abcd 10efghij 10klmnop -> abcdefgh:ijklmnop
        AND     BX, $003F               // BH = 0, BL = 00efghij
        ROR     BX, 2                   // BH = ij000000, BL = 0000efgh
        AND     AL, $3F                 // AL = 00klmnop
        OR      AL, BH                  // AL = ijklmnop
        SHL     AH, 4                   // AH = abcd0000
        OR      AH, BL                  // AH = abcdefgh
        JMP     @SaveU16
@Exit:
        XOR     EAX, EAX
        MOV     [EDI], AX               // terminating 0 character
@Done:
        POP     EAX                     // EAX = start of Dest
        XCHG    EAX, EDI                // EAX = end, EDI = start
        SUB     EAX, EDI                // bytes written
        SHR     EAX, 1                  // -> characters
        POP     EDI
        POP     ESI
        POP     EBX
        POPF
end;
//Decode string from UCS2 to UTF8
function UCS2ToUTF8(Dest: PChar; MaxDestBytes: Cardinal;
Source: PWideChar; SourceChars: Cardinal): Cardinal;
// Encodes 16-bit UCS-2 characters from Source as UTF-8 bytes in Dest and
// appends a terminating 0 byte.
// In:    EAX     = Dest
//        EDX     = MaxDestBytes - size of Dest in bytes, terminator included
//        ECX     = Source
//        (stack) = SourceChars - number of 16-bit characters that may be read
// Out:   EAX     = number of bytes stored in Dest, terminator excluded
// Saved: EBX, ESI, EDI and the flags register are pushed/popped here.
//
// Encoding stops when Dest cannot hold the next complete sequence plus the
// terminator, when a 0 character is read, or after SourceChars characters.
// Every value above $07FF is written as a three-byte sequence; surrogate
// values are not treated specially.
// If Dest is nil or MaxDestBytes = 0 nothing is written and 0 is returned.
// If Source is nil or SourceChars = 0 only the terminator is written.
asm
        PUSHF                           // keep the caller's direction flag
        CLD                             // LODSW/STOSB must move forward
        PUSH    EBX
        PUSH    ESI
        PUSH    EDI
        PUSH    Dest                    // start of Dest, popped at @Done for the length
        MOV     EDI, Dest
        TEST    EDI, EDI
        JZ      @Done                   // no buffer: return 0, write nothing
        TEST    EDX, EDX
        JZ      @Done                   // no room even for the terminator
        TEST    Source, Source
        JZ      @Exit                   // nil source: empty result
        MOV     ESI, Source
        MOV     ECX, SourceChars        // ECX = characters still to convert
        TEST    ECX, ECX
        JZ      @Exit                   // zero characters requested: empty result
@NextChar:
        // each output byte needs EDX >= 2: itself plus the terminator (unsigned)
        CMP     EDX, 2
        JB      @Exit
        DEC     EDX
        XOR     EAX, EAX
        LODSW                           // AX = next UCS-2 character
        TEST    EAX, EAX
        JZ      @Exit                   // end of string
        CMP     AX, $007F
        JA      @11xxxxxx
        // $0000..$007F -> 0xxxxxxx, already in AL
@0xxxxxxx:
        STOSB
@Loop:
        DEC     ECX
        JNZ     @NextChar
        JMP     @Exit
@11xxxxxx:
        CMP     EDX, 2                  // room for a second byte?
        JB      @Exit
        DEC     EDX
        CMP     AX, $07FF
        JA      @1110xxxx
        // 00000abc:defghijk -> 110abcde 10fghijk
        ROR     AX, 6                   // AH = fghijk00, AL = 000abcde
        SHR     AH, 2                   // AH = 00fghijk
        OR      AX, $80C0               // AH = 10fghijk, AL = 110abcde
        STOSW                           // stores AL (lead) then AH
        JMP     @Loop
@1110xxxx:
        CMP     EDX, 2                  // room for a third byte?
        JB      @Exit
        DEC     EDX
        // abcdefgh:ijklmnop -> 1110abcd 10efghij 10klmnop
        MOV     BL, AL                  // keep the low byte for the third output byte
        ROL     AX, 4                   // AH = efghijkl, AL = mnopabcd
        AND     AL, $0F                 // AL = 0000abcd
        SHR     AH, 2                   // AH = 00efghij
        OR      AX, $80E0               // AH = 10efghij, AL = 1110abcd
        STOSW                           // first two bytes
        XCHG    EAX, EBX                // AL = ijklmnop
        AND     AL, $3F                 // AL = 00klmnop
        OR      AL, $80                 // AL = 10klmnop
        JMP     @0xxxxxxx               // store it and count the character
@Exit:
        MOV     BYTE PTR [EDI], $00     // terminating 0 byte
@Done:
        POP     EAX                     // EAX = start of Dest
        XCHG    EAX, EDI                // EAX = end, EDI = start
        SUB     EAX, EDI                // bytes written
        POP     EDI
        POP     ESI
        POP     EBX
        POPF
end;
end.