fpspreadsheet: HTML reader respects character encoding of file

git-svn-id: https://svn.code.sf.net/p/lazarus-ccr/svn@4256 8e941d3f-bd1b-0410-a28a-d453659cc2b4
This commit is contained in:
wp_xxyyzz
2015-08-06 10:19:57 +00:00
parent 8432eece02
commit 3e2a6b4bc6

View File

@ -27,6 +27,7 @@ type
FInSpan: Boolean;
FInA: Boolean;
FInHeader: Boolean;
FEncoding: String;
FTableCounter: Integer;
FCurrRow, FCurrCol: LongInt;
FCurrCellFormat: TsCellFormat;
@ -36,12 +37,13 @@ type
FAttrList: TsHTMLAttrList;
FColSpan, FRowSpan: Integer;
FHRef: String;
procedure ExtractBackgroundColor;
procedure ExtractFont(AFont: TsFont);
procedure ExtractHRef;
procedure ExtractHorAlign;
procedure ExtractMergedRange;
procedure ExtractVertAlign;
procedure ReadBackgroundColor;
procedure ReadEncoding;
procedure ReadFont(AFont: TsFont);
procedure ReadHRef;
procedure ReadHorAlign;
procedure ReadMergedRange;
procedure ReadVertAlign;
procedure InitFont(AFont: TsFont);
procedure InitCellFormat;
procedure TagFoundHandler(NoCaseTag, ActualTag: string);
@ -127,7 +129,7 @@ var
implementation
uses
LazUTF8, URIParser, StrUtils, Math,
LConvEncoding, LazUTF8, URIParser, StrUtils, Math,
fpsUtils, fpsNumFormat;
const
@ -140,6 +142,8 @@ const
constructor TsHTMLReader.Create(AWorkbook: TsWorkbook);
begin
inherited Create(AWorkbook);
FEncoding := EncodingUTF8;
FFormatSettings := HTMLParams.FormatSettings;
ReplaceFormatSettings(FFormatSettings, FWorkbook.FormatSettings);
@ -244,7 +248,7 @@ begin
FWorksheet.WriteUTF8Text(cell, AText);
end;
procedure TsHTMLReader.ExtractBackgroundColor;
procedure TsHTMLReader.ReadBackgroundColor;
var
idx: Integer;
begin
@ -260,7 +264,91 @@ begin
end;
end;
procedure TsHTMLReader.ExtractFont(AFont: TsFont);
procedure TsHTMLReader.ReadEncoding;
function FoundEncoding(AString: string): Boolean;
// https://encoding.spec.whatwg.org/#encodings
begin
Result := true;
case AString of
'utf-8', 'utf8', 'unicode-1-1-utf-8':
FEncoding := 'utf8';
'cp1250', 'windows-1250', 'x-cp1250':
FEncoding := 'cp1250';
'cp1251', 'windows-1251', 'x-cp1251':
FEncoding := 'cp1251';
'ansi_x3.4-1968', 'ascii', 'cp1252', 'cp819', 'csisolatin1', 'ibm819',
'iso-8859-1', 'iso-ir-100', 'iso8859-1', 'iso88591', 'iso_8859-1',
'iso_8859-1:1987', 'l1', 'latin1', 'us-ascii', 'windows-1252', 'x-cp1252':
FEncoding := 'cp1252';
'cp1253', 'windows-1253', 'x-cp1253':
FEncoding := 'cp1253';
'cp1254', 'csisolatin5', 'iso-8859-9', 'iso-ir-148', 'iso8859-9', 'iso88599',
'iso_8859-9', 'iso_8859-9:1989', 'l5', 'latin5', 'windows-1254', 'x-cp1254':
FEncoding := 'cp1254';
'cp1255', 'windows-1255', 'x-cp1255':
FEncoding := 'cp1255';
'cp1256', 'windows-1256', 'x-cp1256':
FEncoding := 'cp1256';
'cp1257', 'windows-1257', 'x-cp1257':
FEncoding := 'cp1257';
'cp1258', 'windows-1258', 'x-cp1258':
FEncoding := 'cp1258';
'866', 'cp866', 'csibm866', 'ibm866':
FEncoding := 'cp866';
'dos-874', 'iso-8859-11', 'iso8859-11', 'iso885911', 'tis-620', 'windows-874':
FEncoding := 'cp874';
'csisolatin2', 'iso-8859-2', 'iso-ir-101', 'iso8859-2', 'iso88592',
'iso_8859-2', 'iso_8859-2:1987', 'l2', 'latin2':
FEncoding := 'cpiso88592';
'csisolatin9', 'iso-8859-15', 'iso8859-15', 'iso885915', 'iso_8859-15', 'l9':
FEncoding := 'cpiso885915';
'csmacintosh', 'mac', 'macintosh', 'x-mac-roman':
FEncoding := 'mactintosh';
'cskoi8r', 'koi', 'koi8', 'koi8-r', 'koi8_r':
FEncoding := 'koi8';
'utf-16be':
FEncoding := 'ucs2be';
'utf-16', 'utf-16le':
FEncoding := 'ucs2le';
else
// Above site notes also some asian code pages which are not supported by
// Lazarus encoding utilities.
Result := false;
end;
end;
var
idx, p: Integer;
s: String;
begin
// HTML 5
idx := FAttrList.IndexOfName('charset');
if idx > -1 then begin
s := Lowercase(FAttrList[idx].Value);
if (s <> '') and FoundEncoding(s) then exit;
end;
// HTML 4
idx := FAttrList.IndexOfName('http-equiv');
if idx > -1 then
begin
idx := FAttrList.IndexOfName('content');
if idx > -1 then begin
s := Lowercase(FAttrList[idx].Value);
p := pos('charset', s);
if p > 0 then
begin
s := copy(s, p+Length('charset')+1, MaxInt);
p := pos(';', s);
if p > 0 then s := copy(s, 1, p-1);
if (s <> '') and FoundEncoding(s) then exit;
end;
end;
end;
end;
procedure TsHTMLReader.ReadFont(AFont: TsFont);
const
Factor = 1.2;
var
@ -350,7 +438,7 @@ begin
AFont.Color := HTMLColorStrToColor(FAttrList[idx].Value);
end;
procedure TsHTMLReader.ExtractHorAlign;
procedure TsHTMLReader.ReadHorAlign;
var
idx: Integer;
s: String;
@ -373,7 +461,7 @@ begin
end;
end;
procedure TsHTMLReader.ExtractHRef;
procedure TsHTMLReader.ReadHRef;
var
idx: Integer;
begin
@ -383,7 +471,7 @@ begin
FHRef := FAttrList[idx].Value;
end;
procedure TsHTMLReader.ExtractMergedRange;
procedure TsHTMLReader.ReadMergedRange;
var
idx: Integer;
begin
@ -398,7 +486,7 @@ begin
// -1 to compensate for correct determination of the range end cell
end;
procedure TsHTMLReader.ExtractVertAlign;
procedure TsHTMLReader.ReadVertAlign;
var
idx: Integer;
s: String;
@ -474,6 +562,11 @@ end;
procedure TsHTMLReader.TagFoundHandler(NoCaseTag, ActualTag: string);
begin
if pos('<META', NoCaseTag) = 1 then
begin
FAttrList.Parse(ActualTag);
ReadEncoding;
end else
if pos('<TABLE', NoCaseTag) = 1 then
begin
inc(FTableCounter);
@ -485,7 +578,7 @@ begin
FCurrCol := -1;
InitFont(FCurrFont);
FAttrList.Parse(ActualTag);
ExtractFont(FCurrFont);
ReadFont(FCurrFont);
FWorkbook.ReplaceFont(DEFAULT_FONTINDEX, FCurrFont.FontName, FCurrFont.Size,
FCurrFont.Style, FCurrFont.Color, FCurrFont.Position);
end else
@ -509,11 +602,11 @@ begin
FCellText := '';
InitCellFormat;
FAttrList.Parse(ActualTag);
ExtractMergedRange;
ExtractBackgroundColor;
ExtractHorAlign;
ExtractVertAlign;
ExtractFont(FCellFont);
ReadMergedRange;
ReadBackgroundColor;
ReadHorAlign;
ReadVertAlign;
ReadFont(FCellFont);
end else
if ((NoCaseTag = '<TH>') or (pos('<TH ', NoCaseTag) = 1)) and FInTable then
begin
@ -530,7 +623,7 @@ begin
begin
FInA := true;
FAttrList.Parse(ActualTag);
ExtractHRef;
ReadHRef;
end else
if (pos('<H', NoCaseTag) = 1) and (NoCaseTag[3] in ['1', '2', '3', '4', '5', '6']) then
begin
@ -579,7 +672,7 @@ procedure TsHTMLReader.TextFoundHandler(AText: String);
begin
if FInCell then
begin
AText := CleanHTMLString(AText);
AText := CleanHTMLString(ConvertEncoding(AText, FEncoding, EncodingUTF8));
if AText <> '' then
begin
if FCellText = '' then