diff --git a/components/fpspreadsheet/fpsconvencoding.pas b/components/fpspreadsheet/fpsconvencoding.pas
new file mode 100644
index 000000000..64b3de550
--- /dev/null
+++ b/components/fpspreadsheet/fpsconvencoding.pas
@@ -0,0 +1,562 @@
+{
+ *****************************************************************************
+ *                                                                           *
+ *  This file is part of the Lazarus Component Library (LCL)                 *
+ *                                                                           *
+ *  See the file COPYING.modifiedLGPL.txt, included in this distribution,    *
+ *  for details about the copyright.                                         *
+ *                                                                           *
+ *  This program is distributed in the hope that it will be useful,          *
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of           *
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                     *
+ *                                                                           *
+ *****************************************************************************
+}
+unit fpsconvencoding;
+
+{ Self-contained text encoding helpers: conversion between UTF-8 and
+  ISO-8859-1 (Latin-1) and UTF-8 byte order mark handling. }
+
+{$mode objfpc}{$H+}
+//{$IFDEF WINDOWS}
+//{$WARNING Windows/Wine/ReactOS locale conversion is not fully supported yet. Sorry.}
+//{$ENDIF}
+
+interface
+
+//{$IFNDEF DisableIconv}
+//{$IFDEF UNIX}{$IF not defined(VER2_2_0) and not defined(VER2_2_2)}{$DEFINE HasIconvEnc}{$ENDIF}{$ENDIF}
+//{$ENDIF}
+
+uses
+  SysUtils, Classes, dos
+  {$IFDEF HasIconvEnc},iconvenc{$ENDIF};
+
+const
+  // Encoding identifiers
+  EncodingUTF8 = 'utf8';
+  EncodingAnsi = 'ansi';
+  EncodingUTF8BOM = 'utf8bom'; // UTF-8 with byte order mark
+  EncodingUCS2LE = 'ucs2le'; // UCS 2 byte little endian
+  EncodingUCS2BE = 'ucs2be'; // UCS 2 byte big endian
+
+type
+  // Converts a whole string from one encoding to another
+  TConvertEncodingFunction = function(const s: string): string;
+  // Maps every single-byte character to its zero-terminated UTF-8 sequence
+  TCharToUTF8Table = array[char] of PChar;
+  // Maps a Unicode code point to a single-byte character code, or to a
+  // negative value if the code point cannot be represented
+  TUnicodeToCharID = function(Unicode: cardinal): integer;
+var
+  // Optional conversion hooks; nil by default and never assigned in this unit
+  ConvertAnsiToUTF8: TConvertEncodingFunction = nil;
+  ConvertUTF8ToAnsi: TConvertEncodingFunction = nil;
+
+// Converts the single-byte encoded s to UTF-8; characters >= #128 are looked
+// up in Table (nil entries are dropped), characters < #128 are copied as is.
+function SingleByteToUTF8(const s: string; const Table: TCharToUTF8Table): string;
+// Removes a leading UTF-8 byte order mark (EF BB BF) from s, if there is one.
+function UTF8BOMToUTF8(const s: string): string; // UTF8 with BOM
+// Converts ISO-8859-1 encoded s to UTF-8.
+function ISO_8859_1ToUTF8(const s: string): string; // Latin-1, western europe
+
+// Converts UTF-8 encoded s to a single-byte encoding. UTF8CharConvFunc maps
+// each code point >= 128 to its character code; code points for which it
+// returns a negative value are dropped.
+function UTF8ToSingleByte(const s: string;
+  const UTF8CharConvFunc: TUnicodeToCharID): string;
+// Converts UTF-8 encoded s to ISO-8859-1; unrepresentable characters are dropped.
+function UTF8ToISO_8859_1(const s: string): string; // Latin-1, western europe
+
+// Decodes the UTF-8 character at p (zero-terminated buffer) and returns its
+// code point; CharLen receives the number of bytes used. A malformed
+// sequence yields its first byte with CharLen=1; p=nil yields 0 with CharLen=0.
+function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal;
+
+implementation
+
+const
+  // UTF-8 byte sequence for each ISO-8859-1 character. Latin-1 code points
+  // equal the Unicode code points U+0000..U+00FF, so #128..#191 encode as
+  // #194 followed by the byte itself and #192..#255 as #195 plus (byte-64).
+  ArrayISO_8859_1ToUTF8: TCharToUTF8Table = (
+    #0, // #0
+    #1, // #1
+    #2, // #2
+    #3, // #3
+    #4, // #4
+    #5, // #5
+    #6, // #6
+    #7, // #7
+    #8, // #8
+    #9, // #9
+    #10, // #10
+    #11, // #11
+    #12, // #12
+    #13, // #13
+    #14, // #14
+    #15, // #15
+    #16, // #16
+    #17, // #17
+    #18, // #18
+    #19, // #19
+    #20, // #20
+    #21, // #21
+    #22, // #22
+    #23, // #23
+    #24, // #24
+    #25, // #25
+    #26, // #26
+    #27, // #27
+    #28, // #28
+    #29, // #29
+    #30, // #30
+    #31, // #31
+    ' ', // ' '
+    '!', // '!'
+    '"', // '"'
+    '#', // '#'
+    '$', // '$'
+    '%', // '%'
+    '&', // '&'
+    '''', // ''''
+    '(', // '('
+    ')', // ')'
+    '*', // '*'
+    '+', // '+'
+    ',', // ','
+    '-', // '-'
+    '.', // '.'
+    '/', // '/'
+    '0', // '0'
+    '1', // '1'
+    '2', // '2'
+    '3', // '3'
+    '4', // '4'
+    '5', // '5'
+    '6', // '6'
+    '7', // '7'
+    '8', // '8'
+    '9', // '9'
+    ':', // ':'
+    ';', // ';'
+    '<', // '<'
+    '=', // '='
+    '>', // '>'
+    '?', // '?'
+    '@', // '@'
+    'A', // 'A'
+    'B', // 'B'
+    'C', // 'C'
+    'D', // 'D'
+    'E', // 'E'
+    'F', // 'F'
+    'G', // 'G'
+    'H', // 'H'
+    'I', // 'I'
+    'J', // 'J'
+    'K', // 'K'
+    'L', // 'L'
+    'M', // 'M'
+    'N', // 'N'
+    'O', // 'O'
+    'P', // 'P'
+    'Q', // 'Q'
+    'R', // 'R'
+    'S', // 'S'
+    'T', // 'T'
+    'U', // 'U'
+    'V', // 'V'
+    'W', // 'W'
+    'X', // 'X'
+    'Y', // 'Y'
+    'Z', // 'Z'
+    '[', // '['
+    '\', // '\'
+    ']', // ']'
+    '^', // '^'
+    '_', // '_'
+    '`', // '`'
+    'a', // 'a'
+    'b', // 'b'
+    'c', // 'c'
+    'd', // 'd'
+    'e', // 'e'
+    'f', // 'f'
+    'g', // 'g'
+    'h', // 'h'
+    'i', // 'i'
+    'j', // 'j'
+    'k', // 'k'
+    'l', // 'l'
+    'm', // 'm'
+    'n', // 'n'
+    'o', // 'o'
+    'p', // 'p'
+    'q', // 'q'
+    'r', // 'r'
+    's', // 's'
+    't', // 't'
+    'u', // 'u'
+    'v', // 'v'
+    'w', // 'w'
+    'x', // 'x'
+    'y', // 'y'
+    'z', // 'z'
+    '{', // '{'
+    '|', // '|'
+    '}', // '}'
+    '~', // '~'
+    #127, // #127
+    #194#128, // #128
+    #194#129, // #129
+    #194#130, // #130
+    #194#131, // #131
+    #194#132, // #132
+    #194#133, // #133
+    #194#134, // #134
+    #194#135, // #135
+    #194#136, // #136
+    #194#137, // #137
+    #194#138, // #138
+    #194#139, // #139
+    #194#140, // #140
+    #194#141, // #141
+    #194#142, // #142
+    #194#143, // #143
+    #194#144, // #144
+    #194#145, // #145
+    #194#146, // #146
+    #194#147, // #147
+    #194#148, // #148
+    #194#149, // #149
+    #194#150, // #150
+    #194#151, // #151
+    #194#152, // #152
+    #194#153, // #153
+    #194#154, // #154
+    #194#155, // #155
+    #194#156, // #156
+    #194#157, // #157
+    #194#158, // #158
+    #194#159, // #159
+    #194#160, // #160
+    #194#161, // #161
+    #194#162, // #162
+    #194#163, // #163
+    #194#164, // #164
+    #194#165, // #165
+    #194#166, // #166
+    #194#167, // #167
+    #194#168, // #168
+    #194#169, // #169
+    #194#170, // #170
+    #194#171, // #171
+    #194#172, // #172
+    #194#173, // #173
+    #194#174, // #174
+    #194#175, // #175
+    #194#176, // #176
+    #194#177, // #177
+    #194#178, // #178
+    #194#179, // #179
+    #194#180, // #180
+    #194#181, // #181
+    #194#182, // #182
+    #194#183, // #183
+    #194#184, // #184
+    #194#185, // #185
+    #194#186, // #186
+    #194#187, // #187
+    #194#188, // #188
+    #194#189, // #189
+    #194#190, // #190
+    #194#191, // #191
+    #195#128, // #192
+    #195#129, // #193
+    #195#130, // #194
+    #195#131, // #195
+    #195#132, // #196
+    #195#133, // #197
+    #195#134, // #198
+    #195#135, // #199
+    #195#136, // #200
+    #195#137, // #201
+    #195#138, // #202
+    #195#139, // #203
+    #195#140, // #204
+    #195#141, // #205
+    #195#142, // #206
+    #195#143, // #207
+    #195#144, // #208
+    #195#145, // #209
+    #195#146, // #210
+    #195#147, // #211
+    #195#148, // #212
+    #195#149, // #213
+    #195#150, // #214
+    #195#151, // #215
+    #195#152, // #216
+    #195#153, // #217
+    #195#154, // #218
+    #195#155, // #219
+    #195#156, // #220
+    #195#157, // #221
+    #195#158, // #222
+    #195#159, // #223
+    #195#160, // #224
+    #195#161, // #225
+    #195#162, // #226
+    #195#163, // #227
+    #195#164, // #228
+    #195#165, // #229
+    #195#166, // #230
+    #195#167, // #231
+    #195#168, // #232
+    #195#169, // #233
+    #195#170, // #234
+    #195#171, // #235
+    #195#172, // #236
+    #195#173, // #237
+    #195#174, // #238
+    #195#175, // #239
+    #195#176, // #240
+    #195#177, // #241
+    #195#178, // #242
+    #195#179, // #243
+    #195#180, // #244
+    #195#181, // #245
+    #195#182, // #246
+    #195#183, // #247
+    #195#184, // #248
+    #195#185, // #249
+    #195#186, // #250
+    #195#187, // #251
+    #195#188, // #252
+    #195#189, // #253
+    #195#190, // #254
+    #195#191 // #255
+  );
+
+function UTF8BOMToUTF8(const s: string): string;
+// Returns s without its leading UTF-8 byte order mark. A string that does
+// not start with the BOM is returned unchanged instead of losing its first
+// three bytes.
+begin
+  if (length(s)>=3) and (s[1]=#$EF) and (s[2]=#$BB) and (s[3]=#$BF) then
+    Result:=copy(s,4,length(s))
+  else
+    Result:=s;
+end;
+
+function ISO_8859_1ToUTF8(const s: string): string;
+// Converts ISO-8859-1 (Latin-1) text to UTF-8 via ArrayISO_8859_1ToUTF8.
+begin
+  Result:=SingleByteToUTF8(s,ArrayISO_8859_1ToUTF8);
+end;
+
+function SingleByteToUTF8(const s: string; const Table: TCharToUTF8Table
+  ): string;
+// Converts a string in a single-byte encoding to UTF-8.
+//   s     - the source text, one byte per character
+//   Table - zero-terminated UTF-8 sequence for every character; only consulted
+//           for characters >= #128, a nil entry drops the character
+// Characters < #128 are copied unchanged without a table lookup.
+var
+  len: Integer;
+  i: Integer;
+  Src: PChar;
+  Dest: PChar;
+  p: PChar;
+  c: Char;
+begin
+  if s='' then begin
+    Result:=s;
+    exit;
+  end;
+  len:=length(s);
+  // reserve the worst case (UTF-8 is at most 4 bytes per character) and
+  // shrink to the number of bytes actually written at the end
+  SetLength(Result,len*4);
+  Src:=PChar(s);
+  Dest:=PChar(Result);
+  for i:=1 to len do begin
+    c:=Src^;
+    inc(Src);
+    if ord(c)<128 then begin
+      // lower half is copied as is (assumes an ASCII compatible encoding)
+      Dest^:=c;
+      inc(Dest);
+    end else begin
+      p:=Table[c];
+      if p<>nil then begin
+        while p^<>#0 do begin
+          Dest^:=p^;
+          inc(p);
+          inc(Dest);
+        end;
+      end;
+    end;
+  end;
+  SetLength(Result,Dest-PChar(Result));
+end;
+
+function UnicodeToISO_8859_1(Unicode: cardinal): integer;
+// Returns the ISO-8859-1 character code for a code point, or -1 if the code
+// point is outside U+0000..U+00FF.
+begin
+  case Unicode of
+  0..255: Result:=Unicode;
+  else Result:=-1;
+  end;
+end;
+
+function UTF8ToUTF8BOM(const s: string): string;
+// Prepends the UTF-8 byte order mark (EF BB BF) to s.
+// Not declared in the interface, so only usable inside this unit.
+begin
+  Result:=#$EF#$BB#$BF+s;
+end;
+
+function UTF8ToISO_8859_1(const s: string): string;
+// Converts UTF-8 text to ISO-8859-1 (Latin-1); characters above U+00FF are
+// dropped because UnicodeToISO_8859_1 reports them as -1.
+begin
+  Result:=UTF8ToSingleByte(s,@UnicodeToISO_8859_1);
+end;
+
+function UTF8ToSingleByte(const s: string;
+  const UTF8CharConvFunc: TUnicodeToCharID): string;
+// Converts UTF-8 text to a single-byte encoding.
+//   s                - the UTF-8 source text
+//   UTF8CharConvFunc - maps a decoded code point to the target character
+//                      code; a negative result drops the character
+// Bytes < #128 are copied unchanged without calling UTF8CharConvFunc.
+var
+  len: Integer;
+  Src: PChar;
+  Dest: PChar;
+  c: Char;
+  Unicode: LongWord;
+  CharLen: integer;
+  i: integer;
+begin
+  if s='' then begin
+    Result:='';
+    exit;
+  end;
+  len:=length(s);
+  // at most one output byte per input byte, so len bytes always suffice
+  SetLength(Result,len);
+  Src:=PChar(s);
+  Dest:=PChar(Result);
+  while len>0 do begin
+    c:=Src^;
+    if c<#128 then begin
+      Dest^:=c;
+      inc(Dest);
+      inc(Src);
+      dec(len);
+    end else begin
+      Unicode:=UTF8CharacterToUnicode(Src,CharLen);
+      inc(Src,CharLen);
+      dec(len,CharLen);
+      i:=UTF8CharConvFunc(Unicode);
+      if i>=0 then begin
+        Dest^:=chr(i);
+        inc(Dest);
+      end;
+    end;
+  end;
+  SetLength(Result,Dest-PChar(Result));
+end;
+
+procedure GetSupportedEncodings(List: TStrings);
+// Appends a fixed list of encoding names to List.
+// Not declared in the interface and not called anywhere in this unit.
+// NOTE(review): this unit only converts UTF-8, UTF-8BOM and ISO-8859-1.
+begin
+  List.Add('UTF-8');
+  List.Add('UTF-8BOM');
+  List.Add('Ansi');
+  List.Add('CP1250');
+  List.Add('CP1251');
+  List.Add('CP1252');
+  List.Add('CP1253');
+  List.Add('CP1254');
+  List.Add('CP1255');
+  List.Add('CP1256');
+  List.Add('CP1257');
+  List.Add('CP1258');
+  List.Add('CP866');
+  List.Add('CP874');
+  List.Add('ISO-8859-1');
+  List.Add('KOI-8');
+  List.Add('UCS-2LE');
+  List.Add('UCS-2BE');
+end;
+
+function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal;
+// Decodes one UTF-8 character starting at p.
+// Returns the code point and stores the number of consumed bytes in CharLen.
+// A lead byte without valid continuation bytes is returned as is (CharLen=1).
+// p=nil returns 0 with CharLen=0. Reads up to 3 bytes after p^ (needs #0 end).
+begin
+  if p<>nil then begin
+    if ord(p^)<%11000000 then begin
+      // regular single byte character (#0 is a normal char, this is pascal ;)
+      Result:=ord(p^);
+      CharLen:=1;
+    end
+    else if ((ord(p^) and %11100000) = %11000000) then begin
+      // could be double byte character
+      if (ord(p[1]) and %11000000) = %10000000 then begin
+        Result:=((ord(p^) and %00011111) shl 6)
+                or (ord(p[1]) and %00111111);
+        CharLen:=2;
+      end else begin
+        Result:=ord(p^);
+        CharLen:=1;
+      end;
+    end
+    else if ((ord(p^) and %11110000) = %11100000) then begin
+      // could be triple byte character
+      if ((ord(p[1]) and %11000000) = %10000000)
+      and ((ord(p[2]) and %11000000) = %10000000) then begin
+        Result:=((ord(p^) and %00001111) shl 12)
+                or ((ord(p[1]) and %00111111) shl 6)
+                or (ord(p[2]) and %00111111);
+        CharLen:=3;
+      end else begin
+        Result:=ord(p^);
+        CharLen:=1;
+      end;
+    end
+    else if ((ord(p^) and %11111000) = %11110000) then begin
+      // could be 4 byte character
+      if ((ord(p[1]) and %11000000) = %10000000)
+      and ((ord(p[2]) and %11000000) = %10000000)
+      and ((ord(p[3]) and %11000000) = %10000000) then begin
+        Result:=((ord(p^) and %00000111) shl 18)
+                or ((ord(p[1]) and %00111111) shl 12)
+                or ((ord(p[2]) and %00111111) shl 6)
+                or (ord(p[3]) and %00111111);
+        CharLen:=4;
+      end else begin
+        Result:=ord(p^);
+        CharLen:=1;
+      end;
+    end
+    else begin
+      // invalid character
+      Result:=ord(p^);
+      CharLen:=1;
+    end;
+  end else begin
+    Result:=0;
+    CharLen:=0;
+  end;
+end;
+
+end.
diff --git a/components/fpspreadsheet/fpspreadsheet.pas b/components/fpspreadsheet/fpspreadsheet.pas index 606a54aba..83278799c 100755 --- a/components/fpspreadsheet/fpspreadsheet.pas +++ b/components/fpspreadsheet/fpspreadsheet.pas @@ -450,7 +450,7 @@ begin //cctFormula cctNumber: Result := FloatToStr(ACell^.NumberValue); - cctUTF8String: Result := UTF8ToAnsi(ACell^.UTF8StringValue); + cctUTF8String: Result := ACell^.UTF8StringValue; else Result := ''; end; diff --git a/components/fpspreadsheet/laz_fpspreadsheet.lpk b/components/fpspreadsheet/laz_fpspreadsheet.lpk index 62b7b9ad1..69667f7cf 100644 --- a/components/fpspreadsheet/laz_fpspreadsheet.lpk +++ b/components/fpspreadsheet/laz_fpspreadsheet.lpk @@ -7,14 +7,14 @@ - + - + @@ -87,6 +87,10 @@ + + + + diff --git a/components/fpspreadsheet/laz_fpspreadsheet.pas b/components/fpspreadsheet/laz_fpspreadsheet.pas index 32374c3ec..7c8dcafbd 100644 --- a/components/fpspreadsheet/laz_fpspreadsheet.pas +++ b/components/fpspreadsheet/laz_fpspreadsheet.pas @@ -7,11 +7,11 @@ unit laz_fpspreadsheet; interface uses - fpolestorage, fpsallformats, fpsopendocument, fpspreadsheet, xlsbiff2, + fpolestorage, fpsallformats, fpsopendocument, fpspreadsheet, xlsbiff2, xlsbiff5, xlsbiff8, xlsxooxml, fpsutils, fpszipper, uvirtuallayer_types, uvirtuallayer, uvirtuallayer_ole, uvirtuallayer_ole_helpers, uvirtuallayer_ole_types, uvirtuallayer_stream, fpolebasic, xlscommon, - LazarusPackageIntf; + fpsconvencoding, LazarusPackageIntf; implementation diff --git a/components/fpspreadsheet/xlsbiff2.pas b/components/fpspreadsheet/xlsbiff2.pas index 58623f855..27739bc85 100755 --- a/components/fpspreadsheet/xlsbiff2.pas +++ b/components/fpspreadsheet/xlsbiff2.pas @@ -31,7 +31,7 @@ interface uses Classes, SysUtils, - fpspreadsheet, xlscommon, fpsutils; + fpspreadsheet, xlscommon, fpsutils, fpsconvencoding; type @@ -283,7 +283,7 @@ var begin if AValue = '' then Exit; // Writing an empty text doesn't work - AnsiText := UTF8ToAnsi(AValue); + AnsiText 
:= UTF8ToISO_8859_1(AValue); L := Length(AnsiText); { BIFF Record header } @@ -400,7 +400,7 @@ begin AStrValue := AValue; { Save the data } - FWorksheet.WriteUTF8Text(ARow, ACol, AnsiToUTF8(AStrValue)); + FWorksheet.WriteUTF8Text(ARow, ACol, ISO_8859_1ToUTF8(AStrValue)); end; procedure TsSpreadBIFF2Reader.ReadNumber(AStream: TStream); diff --git a/components/fpspreadsheet/xlsbiff5.pas b/components/fpspreadsheet/xlsbiff5.pas index e3d6bc7a0..f71af693d 100755 --- a/components/fpspreadsheet/xlsbiff5.pas +++ b/components/fpspreadsheet/xlsbiff5.pas @@ -64,7 +64,7 @@ uses {$else} fpolestorage, {$endif} - fpsutils; + fpsutils, fpsconvencoding; type @@ -457,7 +457,7 @@ var Len: Byte; LatinSheetName: string; begin - LatinSheetName := UTF8ToAnsi(ASheetName); // Should actually be UTF-8 to Latin 1 ISO + LatinSheetName := UTF8ToISO_8859_1(ASheetName); Len := Length(LatinSheetName); { BIFF Record header } @@ -734,7 +734,7 @@ var L: Word; AnsiValue: ansistring; begin - AnsiValue := UTF8ToAnsi(AValue); + AnsiValue := UTF8ToISO_8859_1(AValue); if AnsiValue = '' then begin // Bad formatted UTF8String (maybe ANSI?) @@ -1118,7 +1118,7 @@ begin AStream.ReadBuffer(AStrValue[1], L); { Save the data } - FWorksheet.WriteUTF8Text(ARow, ACol, AnsiToUTF8(AStrValue)); + FWorksheet.WriteUTF8Text(ARow, ACol, ISO_8859_1ToUTF8(AStrValue)); //Read formatting runs (not supported) B:=AStream.ReadByte; for L := 0 to B-1 do begin @@ -1293,7 +1293,7 @@ begin AStrValue := AValue; { Save the data } - FWorksheet.WriteUTF8Text(ARow, ACol, AnsiToUTF8(AStrValue)); + FWorksheet.WriteUTF8Text(ARow, ACol, ISO_8859_1ToUTF8(AStrValue)); end; procedure TsSpreadBIFF5Reader.ReadNumber(AStream: TStream);