You've already forked lazarus-ccr
fpspreadsheet: HTML reader respects character encoding of file
git-svn-id: https://svn.code.sf.net/p/lazarus-ccr/svn@4256 8e941d3f-bd1b-0410-a28a-d453659cc2b4
This commit is contained in:
@ -27,6 +27,7 @@ type
|
|||||||
FInSpan: Boolean;
|
FInSpan: Boolean;
|
||||||
FInA: Boolean;
|
FInA: Boolean;
|
||||||
FInHeader: Boolean;
|
FInHeader: Boolean;
|
||||||
|
FEncoding: String;
|
||||||
FTableCounter: Integer;
|
FTableCounter: Integer;
|
||||||
FCurrRow, FCurrCol: LongInt;
|
FCurrRow, FCurrCol: LongInt;
|
||||||
FCurrCellFormat: TsCellFormat;
|
FCurrCellFormat: TsCellFormat;
|
||||||
@ -36,12 +37,13 @@ type
|
|||||||
FAttrList: TsHTMLAttrList;
|
FAttrList: TsHTMLAttrList;
|
||||||
FColSpan, FRowSpan: Integer;
|
FColSpan, FRowSpan: Integer;
|
||||||
FHRef: String;
|
FHRef: String;
|
||||||
procedure ExtractBackgroundColor;
|
procedure ReadBackgroundColor;
|
||||||
procedure ExtractFont(AFont: TsFont);
|
procedure ReadEncoding;
|
||||||
procedure ExtractHRef;
|
procedure ReadFont(AFont: TsFont);
|
||||||
procedure ExtractHorAlign;
|
procedure ReadHRef;
|
||||||
procedure ExtractMergedRange;
|
procedure ReadHorAlign;
|
||||||
procedure ExtractVertAlign;
|
procedure ReadMergedRange;
|
||||||
|
procedure ReadVertAlign;
|
||||||
procedure InitFont(AFont: TsFont);
|
procedure InitFont(AFont: TsFont);
|
||||||
procedure InitCellFormat;
|
procedure InitCellFormat;
|
||||||
procedure TagFoundHandler(NoCaseTag, ActualTag: string);
|
procedure TagFoundHandler(NoCaseTag, ActualTag: string);
|
||||||
@ -127,7 +129,7 @@ var
|
|||||||
implementation
|
implementation
|
||||||
|
|
||||||
uses
|
uses
|
||||||
LazUTF8, URIParser, StrUtils, Math,
|
LConvEncoding, LazUTF8, URIParser, StrUtils, Math,
|
||||||
fpsUtils, fpsNumFormat;
|
fpsUtils, fpsNumFormat;
|
||||||
|
|
||||||
const
|
const
|
||||||
@ -140,6 +142,8 @@ const
|
|||||||
constructor TsHTMLReader.Create(AWorkbook: TsWorkbook);
|
constructor TsHTMLReader.Create(AWorkbook: TsWorkbook);
|
||||||
begin
|
begin
|
||||||
inherited Create(AWorkbook);
|
inherited Create(AWorkbook);
|
||||||
|
FEncoding := EncodingUTF8;
|
||||||
|
|
||||||
FFormatSettings := HTMLParams.FormatSettings;
|
FFormatSettings := HTMLParams.FormatSettings;
|
||||||
ReplaceFormatSettings(FFormatSettings, FWorkbook.FormatSettings);
|
ReplaceFormatSettings(FFormatSettings, FWorkbook.FormatSettings);
|
||||||
|
|
||||||
@ -244,7 +248,7 @@ begin
|
|||||||
FWorksheet.WriteUTF8Text(cell, AText);
|
FWorksheet.WriteUTF8Text(cell, AText);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
procedure TsHTMLReader.ExtractBackgroundColor;
|
procedure TsHTMLReader.ReadBackgroundColor;
|
||||||
var
|
var
|
||||||
idx: Integer;
|
idx: Integer;
|
||||||
begin
|
begin
|
||||||
@ -260,7 +264,91 @@ begin
|
|||||||
end;
|
end;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
procedure TsHTMLReader.ExtractFont(AFont: TsFont);
|
procedure TsHTMLReader.ReadEncoding;
|
||||||
|
|
||||||
|
function FoundEncoding(AString: string): Boolean;
|
||||||
|
// https://encoding.spec.whatwg.org/#encodings
|
||||||
|
begin
|
||||||
|
Result := true;
|
||||||
|
case AString of
|
||||||
|
'utf-8', 'utf8', 'unicode-1-1-utf-8':
|
||||||
|
FEncoding := 'utf8';
|
||||||
|
'cp1250', 'windows-1250', 'x-cp1250':
|
||||||
|
FEncoding := 'cp1250';
|
||||||
|
'cp1251', 'windows-1251', 'x-cp1251':
|
||||||
|
FEncoding := 'cp1251';
|
||||||
|
'ansi_x3.4-1968', 'ascii', 'cp1252', 'cp819', 'csisolatin1', 'ibm819',
|
||||||
|
'iso-8859-1', 'iso-ir-100', 'iso8859-1', 'iso88591', 'iso_8859-1',
|
||||||
|
'iso_8859-1:1987', 'l1', 'latin1', 'us-ascii', 'windows-1252', 'x-cp1252':
|
||||||
|
FEncoding := 'cp1252';
|
||||||
|
'cp1253', 'windows-1253', 'x-cp1253':
|
||||||
|
FEncoding := 'cp1253';
|
||||||
|
'cp1254', 'csisolatin5', 'iso-8859-9', 'iso-ir-148', 'iso8859-9', 'iso88599',
|
||||||
|
'iso_8859-9', 'iso_8859-9:1989', 'l5', 'latin5', 'windows-1254', 'x-cp1254':
|
||||||
|
FEncoding := 'cp1254';
|
||||||
|
'cp1255', 'windows-1255', 'x-cp1255':
|
||||||
|
FEncoding := 'cp1255';
|
||||||
|
'cp1256', 'windows-1256', 'x-cp1256':
|
||||||
|
FEncoding := 'cp1256';
|
||||||
|
'cp1257', 'windows-1257', 'x-cp1257':
|
||||||
|
FEncoding := 'cp1257';
|
||||||
|
'cp1258', 'windows-1258', 'x-cp1258':
|
||||||
|
FEncoding := 'cp1258';
|
||||||
|
'866', 'cp866', 'csibm866', 'ibm866':
|
||||||
|
FEncoding := 'cp866';
|
||||||
|
'dos-874', 'iso-8859-11', 'iso8859-11', 'iso885911', 'tis-620', 'windows-874':
|
||||||
|
FEncoding := 'cp874';
|
||||||
|
'csisolatin2', 'iso-8859-2', 'iso-ir-101', 'iso8859-2', 'iso88592',
|
||||||
|
'iso_8859-2', 'iso_8859-2:1987', 'l2', 'latin2':
|
||||||
|
FEncoding := 'cpiso88592';
|
||||||
|
'csisolatin9', 'iso-8859-15', 'iso8859-15', 'iso885915', 'iso_8859-15', 'l9':
|
||||||
|
FEncoding := 'cpiso885915';
|
||||||
|
'csmacintosh', 'mac', 'macintosh', 'x-mac-roman':
|
||||||
|
FEncoding := 'mactintosh';
|
||||||
|
'cskoi8r', 'koi', 'koi8', 'koi8-r', 'koi8_r':
|
||||||
|
FEncoding := 'koi8';
|
||||||
|
'utf-16be':
|
||||||
|
FEncoding := 'ucs2be';
|
||||||
|
'utf-16', 'utf-16le':
|
||||||
|
FEncoding := 'ucs2le';
|
||||||
|
else
|
||||||
|
// Above site notes also some asian code pages which are not supported by
|
||||||
|
// Lazarus encoding utilities.
|
||||||
|
Result := false;
|
||||||
|
end;
|
||||||
|
end;
|
||||||
|
|
||||||
|
var
|
||||||
|
idx, p: Integer;
|
||||||
|
s: String;
|
||||||
|
begin
|
||||||
|
// HTML 5
|
||||||
|
idx := FAttrList.IndexOfName('charset');
|
||||||
|
if idx > -1 then begin
|
||||||
|
s := Lowercase(FAttrList[idx].Value);
|
||||||
|
if (s <> '') and FoundEncoding(s) then exit;
|
||||||
|
end;
|
||||||
|
|
||||||
|
// HTML 4
|
||||||
|
idx := FAttrList.IndexOfName('http-equiv');
|
||||||
|
if idx > -1 then
|
||||||
|
begin
|
||||||
|
idx := FAttrList.IndexOfName('content');
|
||||||
|
if idx > -1 then begin
|
||||||
|
s := Lowercase(FAttrList[idx].Value);
|
||||||
|
p := pos('charset', s);
|
||||||
|
if p > 0 then
|
||||||
|
begin
|
||||||
|
s := copy(s, p+Length('charset')+1, MaxInt);
|
||||||
|
p := pos(';', s);
|
||||||
|
if p > 0 then s := copy(s, 1, p-1);
|
||||||
|
if (s <> '') and FoundEncoding(s) then exit;
|
||||||
|
end;
|
||||||
|
end;
|
||||||
|
end;
|
||||||
|
end;
|
||||||
|
|
||||||
|
procedure TsHTMLReader.ReadFont(AFont: TsFont);
|
||||||
const
|
const
|
||||||
Factor = 1.2;
|
Factor = 1.2;
|
||||||
var
|
var
|
||||||
@ -350,7 +438,7 @@ begin
|
|||||||
AFont.Color := HTMLColorStrToColor(FAttrList[idx].Value);
|
AFont.Color := HTMLColorStrToColor(FAttrList[idx].Value);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
procedure TsHTMLReader.ExtractHorAlign;
|
procedure TsHTMLReader.ReadHorAlign;
|
||||||
var
|
var
|
||||||
idx: Integer;
|
idx: Integer;
|
||||||
s: String;
|
s: String;
|
||||||
@ -373,7 +461,7 @@ begin
|
|||||||
end;
|
end;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
procedure TsHTMLReader.ExtractHRef;
|
procedure TsHTMLReader.ReadHRef;
|
||||||
var
|
var
|
||||||
idx: Integer;
|
idx: Integer;
|
||||||
begin
|
begin
|
||||||
@ -383,7 +471,7 @@ begin
|
|||||||
FHRef := FAttrList[idx].Value;
|
FHRef := FAttrList[idx].Value;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
procedure TsHTMLReader.ExtractMergedRange;
|
procedure TsHTMLReader.ReadMergedRange;
|
||||||
var
|
var
|
||||||
idx: Integer;
|
idx: Integer;
|
||||||
begin
|
begin
|
||||||
@ -398,7 +486,7 @@ begin
|
|||||||
// -1 to compensate for correct determination of the range end cell
|
// -1 to compensate for correct determination of the range end cell
|
||||||
end;
|
end;
|
||||||
|
|
||||||
procedure TsHTMLReader.ExtractVertAlign;
|
procedure TsHTMLReader.ReadVertAlign;
|
||||||
var
|
var
|
||||||
idx: Integer;
|
idx: Integer;
|
||||||
s: String;
|
s: String;
|
||||||
@ -474,6 +562,11 @@ end;
|
|||||||
|
|
||||||
procedure TsHTMLReader.TagFoundHandler(NoCaseTag, ActualTag: string);
|
procedure TsHTMLReader.TagFoundHandler(NoCaseTag, ActualTag: string);
|
||||||
begin
|
begin
|
||||||
|
if pos('<META', NoCaseTag) = 1 then
|
||||||
|
begin
|
||||||
|
FAttrList.Parse(ActualTag);
|
||||||
|
ReadEncoding;
|
||||||
|
end else
|
||||||
if pos('<TABLE', NoCaseTag) = 1 then
|
if pos('<TABLE', NoCaseTag) = 1 then
|
||||||
begin
|
begin
|
||||||
inc(FTableCounter);
|
inc(FTableCounter);
|
||||||
@ -485,7 +578,7 @@ begin
|
|||||||
FCurrCol := -1;
|
FCurrCol := -1;
|
||||||
InitFont(FCurrFont);
|
InitFont(FCurrFont);
|
||||||
FAttrList.Parse(ActualTag);
|
FAttrList.Parse(ActualTag);
|
||||||
ExtractFont(FCurrFont);
|
ReadFont(FCurrFont);
|
||||||
FWorkbook.ReplaceFont(DEFAULT_FONTINDEX, FCurrFont.FontName, FCurrFont.Size,
|
FWorkbook.ReplaceFont(DEFAULT_FONTINDEX, FCurrFont.FontName, FCurrFont.Size,
|
||||||
FCurrFont.Style, FCurrFont.Color, FCurrFont.Position);
|
FCurrFont.Style, FCurrFont.Color, FCurrFont.Position);
|
||||||
end else
|
end else
|
||||||
@ -509,11 +602,11 @@ begin
|
|||||||
FCellText := '';
|
FCellText := '';
|
||||||
InitCellFormat;
|
InitCellFormat;
|
||||||
FAttrList.Parse(ActualTag);
|
FAttrList.Parse(ActualTag);
|
||||||
ExtractMergedRange;
|
ReadMergedRange;
|
||||||
ExtractBackgroundColor;
|
ReadBackgroundColor;
|
||||||
ExtractHorAlign;
|
ReadHorAlign;
|
||||||
ExtractVertAlign;
|
ReadVertAlign;
|
||||||
ExtractFont(FCellFont);
|
ReadFont(FCellFont);
|
||||||
end else
|
end else
|
||||||
if ((NoCaseTag = '<TH>') or (pos('<TH ', NoCaseTag) = 1)) and FInTable then
|
if ((NoCaseTag = '<TH>') or (pos('<TH ', NoCaseTag) = 1)) and FInTable then
|
||||||
begin
|
begin
|
||||||
@ -530,7 +623,7 @@ begin
|
|||||||
begin
|
begin
|
||||||
FInA := true;
|
FInA := true;
|
||||||
FAttrList.Parse(ActualTag);
|
FAttrList.Parse(ActualTag);
|
||||||
ExtractHRef;
|
ReadHRef;
|
||||||
end else
|
end else
|
||||||
if (pos('<H', NoCaseTag) = 1) and (NoCaseTag[3] in ['1', '2', '3', '4', '5', '6']) then
|
if (pos('<H', NoCaseTag) = 1) and (NoCaseTag[3] in ['1', '2', '3', '4', '5', '6']) then
|
||||||
begin
|
begin
|
||||||
@ -579,7 +672,7 @@ procedure TsHTMLReader.TextFoundHandler(AText: String);
|
|||||||
begin
|
begin
|
||||||
if FInCell then
|
if FInCell then
|
||||||
begin
|
begin
|
||||||
AText := CleanHTMLString(AText);
|
AText := CleanHTMLString(ConvertEncoding(AText, FEncoding, EncodingUTF8));
|
||||||
if AText <> '' then
|
if AText <> '' then
|
||||||
begin
|
begin
|
||||||
if FCellText = '' then
|
if FCellText = '' then
|
||||||
|
Reference in New Issue
Block a user