You've already forked lazarus-ccr
fpspreadsheet: Evaluate the colspan and rowspan attributes to merge cells when reading html files
git-svn-id: https://svn.code.sf.net/p/lazarus-ccr/svn@4237 8e941d3f-bd1b-0410-a28a-d453659cc2b4
This commit is contained in:
@ -6,7 +6,7 @@ interface
|
|||||||
|
|
||||||
uses
|
uses
|
||||||
Classes, SysUtils, fasthtmlparser,
|
Classes, SysUtils, fasthtmlparser,
|
||||||
fpstypes, fpspreadsheet, fpsReaderWriter;
|
fpstypes, fpspreadsheet, fpsReaderWriter, fpsHTMLUtils;
|
||||||
|
|
||||||
type
|
type
|
||||||
TsHTMLTokenKind = (htkTABLE, htkTR, htkTH, htkTD, htkDIV, htkSPAN, htkP);
|
TsHTMLTokenKind = (htkTABLE, htkTR, htkTH, htkTD, htkDIV, htkSPAN, htkP);
|
||||||
@ -29,10 +29,13 @@ type
|
|||||||
FTableCounter: Integer;
|
FTableCounter: Integer;
|
||||||
FCurrRow, FCurrCol: LongInt;
|
FCurrRow, FCurrCol: LongInt;
|
||||||
FCelLText: String;
|
FCelLText: String;
|
||||||
|
FAttrList: TsHTMLAttrList;
|
||||||
|
FColSpan, FRowSpan: Integer;
|
||||||
|
procedure ExtractMergedRange;
|
||||||
procedure TagFoundHandler(NoCaseTag, ActualTag: string);
|
procedure TagFoundHandler(NoCaseTag, ActualTag: string);
|
||||||
procedure TextFoundHandler(AText: String);
|
procedure TextFoundHandler(AText: String);
|
||||||
protected
|
protected
|
||||||
procedure ProcessCellValue(ARow, ACol: LongInt; AText: String);
|
procedure AddCell(ARow, ACol: LongInt; AText: String);
|
||||||
public
|
public
|
||||||
constructor Create(AWorkbook: TsWorkbook); override;
|
constructor Create(AWorkbook: TsWorkbook); override;
|
||||||
destructor Destroy; override;
|
destructor Destroy; override;
|
||||||
@ -113,7 +116,7 @@ implementation
|
|||||||
|
|
||||||
uses
|
uses
|
||||||
LazUTF8, URIParser, StrUtils,
|
LazUTF8, URIParser, StrUtils,
|
||||||
fpsUtils, fpsHTMLUtils, fpsNumFormat;
|
fpsUtils, fpsNumFormat;
|
||||||
(*
|
(*
|
||||||
type
|
type
|
||||||
THTMLEntity = record
|
THTMLEntity = record
|
||||||
@ -413,44 +416,17 @@ begin
|
|||||||
FFormatSettings := HTMLParams.FormatSettings;
|
FFormatSettings := HTMLParams.FormatSettings;
|
||||||
ReplaceFormatSettings(FFormatSettings, FWorkbook.FormatSettings);
|
ReplaceFormatSettings(FFormatSettings, FWorkbook.FormatSettings);
|
||||||
FTableCounter := -1;
|
FTableCounter := -1;
|
||||||
|
FAttrList := TsHTMLAttrList.Create;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
destructor TsHTMLReader.Destroy;
|
destructor TsHTMLReader.Destroy;
|
||||||
begin
|
begin
|
||||||
|
FreeAndNil(FAttrList);
|
||||||
FreeAndNil(parser);
|
FreeAndNil(parser);
|
||||||
inherited Destroy;
|
inherited Destroy;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
procedure TsHTMLReader.ReadFromStream(AStream: TStream);
|
procedure TsHTMLReader.AddCell(ARow, ACol: LongInt; AText: String);
|
||||||
var
|
|
||||||
list: TStringList;
|
|
||||||
begin
|
|
||||||
list := TStringList.Create;
|
|
||||||
try
|
|
||||||
list.LoadFromStream(AStream);
|
|
||||||
ReadFromStrings(list);
|
|
||||||
if FWorkbook.GetWorksheetCount = 0 then
|
|
||||||
begin
|
|
||||||
FWorkbook.AddErrorMsg('Requested table not found, or no tables in html file');
|
|
||||||
FWorkbook.AddWorksheet('Dummy');
|
|
||||||
end;
|
|
||||||
finally
|
|
||||||
list.Free;
|
|
||||||
end;
|
|
||||||
end;
|
|
||||||
|
|
||||||
procedure TsHTMLReader.ReadFromStrings(AStrings: TStrings);
|
|
||||||
begin
|
|
||||||
// Create html parser
|
|
||||||
FreeAndNil(parser);
|
|
||||||
parser := THTMLParser.Create(AStrings.Text);
|
|
||||||
parser.OnFoundTag := @TagFoundHandler;
|
|
||||||
parser.OnFoundText := @TextFoundHandler;
|
|
||||||
// Execute the html parser
|
|
||||||
parser.Exec;
|
|
||||||
end;
|
|
||||||
|
|
||||||
procedure TsHTMLReader.ProcessCellValue(ARow, ACol: LongInt; AText: String);
|
|
||||||
var
|
var
|
||||||
cell: PCell;
|
cell: PCell;
|
||||||
dblValue: Double;
|
dblValue: Double;
|
||||||
@ -467,6 +443,10 @@ begin
|
|||||||
|
|
||||||
cell := FWorksheet.AddCell(ARow, ACol);
|
cell := FWorksheet.AddCell(ARow, ACol);
|
||||||
|
|
||||||
|
// Merged cells
|
||||||
|
if (FColSpan > 0) or (FRowSpan > 0) then
|
||||||
|
FWorksheet.MergeCells(ARow, ACol, ARow + FRowSpan, ACol + FColSpan);
|
||||||
|
|
||||||
// Do not try to interpret the strings. --> everything is a LABEL cell.
|
// Do not try to interpret the strings. --> everything is a LABEL cell.
|
||||||
if not HTMLParams.DetectContentType then
|
if not HTMLParams.DetectContentType then
|
||||||
begin
|
begin
|
||||||
@ -506,6 +486,49 @@ begin
|
|||||||
FWorksheet.WriteUTF8Text(cell, AText);
|
FWorksheet.WriteUTF8Text(cell, AText);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
|
procedure TsHTMLReader.ExtractMergedRange;
|
||||||
|
var
|
||||||
|
idx: Integer;
|
||||||
|
begin
|
||||||
|
FColSpan := 0;
|
||||||
|
FRowSpan := 0;
|
||||||
|
idx := FAttrList.IndexOfName('colspan');
|
||||||
|
if idx > -1 then
|
||||||
|
FColSpan := StrToInt(FAttrList[idx].Value) - 1;
|
||||||
|
idx := FAttrList.IndexOfName('rowspan');
|
||||||
|
if idx > -1 then
|
||||||
|
FRowSpan := StrToInt(FAttrList[idx].Value) - 1;
|
||||||
|
// -1 to compensate for correct determination of the range end cell
|
||||||
|
end;
|
||||||
|
|
||||||
|
procedure TsHTMLReader.ReadFromStream(AStream: TStream);
|
||||||
|
var
|
||||||
|
list: TStringList;
|
||||||
|
begin
|
||||||
|
list := TStringList.Create;
|
||||||
|
try
|
||||||
|
list.LoadFromStream(AStream);
|
||||||
|
ReadFromStrings(list);
|
||||||
|
if FWorkbook.GetWorksheetCount = 0 then
|
||||||
|
begin
|
||||||
|
FWorkbook.AddErrorMsg('Requested table not found, or no tables in html file');
|
||||||
|
FWorkbook.AddWorksheet('Dummy');
|
||||||
|
end;
|
||||||
|
finally
|
||||||
|
list.Free;
|
||||||
|
end;
|
||||||
|
end;
|
||||||
|
|
||||||
|
procedure TsHTMLReader.ReadFromStrings(AStrings: TStrings);
|
||||||
|
begin
|
||||||
|
// Create html parser
|
||||||
|
FreeAndNil(parser);
|
||||||
|
parser := THTMLParser.Create(AStrings.Text);
|
||||||
|
parser.OnFoundTag := @TagFoundHandler;
|
||||||
|
parser.OnFoundText := @TextFoundHandler;
|
||||||
|
// Execute the html parser
|
||||||
|
parser.Exec;
|
||||||
|
end;
|
||||||
|
|
||||||
procedure TsHTMLReader.TagFoundHandler(NoCaseTag, ActualTag: string);
|
procedure TsHTMLReader.TagFoundHandler(NoCaseTag, ActualTag: string);
|
||||||
begin
|
begin
|
||||||
@ -537,6 +560,8 @@ begin
|
|||||||
FInCell := true;
|
FInCell := true;
|
||||||
inc(FCurrCol);
|
inc(FCurrCol);
|
||||||
FCellText := '';
|
FCellText := '';
|
||||||
|
FAttrList.Parse(ActualTag);
|
||||||
|
ExtractMergedRange;
|
||||||
end else
|
end else
|
||||||
if ((NoCaseTag = '<TH>') or (pos('<TH ', NoCaseTag) = 1)) and FInTable then
|
if ((NoCaseTag = '<TH>') or (pos('<TH ', NoCaseTag) = 1)) and FInTable then
|
||||||
begin
|
begin
|
||||||
@ -567,7 +592,19 @@ begin
|
|||||||
'</TD>', '</TH>':
|
'</TD>', '</TH>':
|
||||||
if FInCell then
|
if FInCell then
|
||||||
begin
|
begin
|
||||||
ProcessCellValue(FCurrRow, FCurrCol, FCellText);
|
// inc(FCurrCol);
|
||||||
|
while FWorksheet.isMerged(FWorksheet.FindCell(FCurrRow, FCurrRow)) do
|
||||||
|
inc(FCurrRow);
|
||||||
|
{
|
||||||
|
if FWorksheet.IsMerged(FWorksheet.FindCell(FCurrRow, FCurrCol)) then
|
||||||
|
begin
|
||||||
|
repeat
|
||||||
|
inc(FCurrRow);
|
||||||
|
until not FWorksheet.IsMerged(FWorksheet.FindCell(FCurrRow, FCurrCol));
|
||||||
|
dec(FCurrCol);
|
||||||
|
end;
|
||||||
|
}
|
||||||
|
AddCell(FCurrRow, FCurrCol, FCellText);
|
||||||
FInCell := false;
|
FInCell := false;
|
||||||
end;
|
end;
|
||||||
'</A>':
|
'</A>':
|
||||||
@ -576,12 +613,11 @@ begin
|
|||||||
if FInCell then FInSpan := false;
|
if FInCell then FInSpan := false;
|
||||||
'<H1/>', '<H2/>', '<H3/>', '<H4/>', '<H5/>', '<H6/>':
|
'<H1/>', '<H2/>', '<H3/>', '<H4/>', '<H5/>', '<H6/>':
|
||||||
if FinCell then FInHeader := false;
|
if FinCell then FInHeader := false;
|
||||||
'<TR/>', '<TR />':
|
'<TR/>', '<TR />': // empty rows
|
||||||
if FInTable then inc(FCurrRow);
|
if FInTable then inc(FCurrRow);
|
||||||
'<TD/>', '<TD />':
|
'<TD/>', '<TD />', '<TH/>', '<TH />': // empty cells
|
||||||
if FInCell then inc(FCurrCol);
|
if FInCell then
|
||||||
'<TH/>', '<TH />':
|
inc(FCurrCol);
|
||||||
if FInCell then inc(FCurrCol);
|
|
||||||
end;
|
end;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
|
@ -5,18 +5,34 @@ unit fpsHTMLUtils;
|
|||||||
interface
|
interface
|
||||||
|
|
||||||
uses
|
uses
|
||||||
Classes, SysUtils;
|
Classes, SysUtils, contnrs;
|
||||||
|
|
||||||
type
|
type
|
||||||
THTMLEntity = record
|
TsHTMLEntity = record
|
||||||
E: String;
|
E: String;
|
||||||
Ch: String;
|
Ch: String;
|
||||||
N: Word;
|
N: Word;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
|
|
||||||
function IsHTMLEntity(AText: PChar; out AEntity: THTMLEntity): Boolean;
|
|
||||||
function CleanHTMLString(AText: String): String;
|
function CleanHTMLString(AText: String): String;
|
||||||
|
function IsHTMLEntity(AText: PChar; out AEntity: TsHTMLEntity): Boolean;
|
||||||
|
|
||||||
|
type
|
||||||
|
TsHTMLAttr = class
|
||||||
|
Name: String;
|
||||||
|
Value: String;
|
||||||
|
constructor Create(AName, AValue: String);
|
||||||
|
end;
|
||||||
|
|
||||||
|
TsHTMLAttrList = class(TObjectList)
|
||||||
|
private
|
||||||
|
function GetItem(AIndex: Integer): TsHTMLAttr;
|
||||||
|
procedure SetItem(AIndex: Integer; AValue: TsHTMLAttr);
|
||||||
|
public
|
||||||
|
function IndexOfName(AName: String): Integer;
|
||||||
|
procedure Parse(AHTML: String);
|
||||||
|
property Items[AIndex: Integer]: TsHTMLAttr read GetItem write SetItem; default;
|
||||||
|
end;
|
||||||
|
|
||||||
|
|
||||||
implementation
|
implementation
|
||||||
@ -26,7 +42,7 @@ uses
|
|||||||
|
|
||||||
const
|
const
|
||||||
// http://unicode.e-workers.de/entities.php
|
// http://unicode.e-workers.de/entities.php
|
||||||
HTMLEntities: array[0..250] of THTMLEntity = (
|
HTMLEntities: array[0..250] of TsHTMLEntity = (
|
||||||
// A
|
// A
|
||||||
(E: 'Acirc'; Ch: 'Â'; N: 194), // 0
|
(E: 'Acirc'; Ch: 'Â'; N: 194), // 0
|
||||||
(E: 'acirc'; Ch: 'â'; N: 226),
|
(E: 'acirc'; Ch: 'â'; N: 226),
|
||||||
@ -305,7 +321,7 @@ const
|
|||||||
(E: 'zwnj'; Ch: ''; N: 8204) // Zero-width non-joiner
|
(E: 'zwnj'; Ch: ''; N: 8204) // Zero-width non-joiner
|
||||||
);
|
);
|
||||||
|
|
||||||
function IsHTMLEntity(AText: PChar; out AEntity: THTMLEntity): Boolean;
|
function IsHTMLEntity(AText: PChar; out AEntity: TsHTMLEntity): Boolean;
|
||||||
|
|
||||||
function Compare(s: String): Boolean;
|
function Compare(s: String): Boolean;
|
||||||
var
|
var
|
||||||
@ -357,7 +373,7 @@ end;
|
|||||||
function CleanHTMLString(AText: String): String;
|
function CleanHTMLString(AText: String): String;
|
||||||
var
|
var
|
||||||
len: Integer;
|
len: Integer;
|
||||||
ent: THTMLEntity;
|
ent: TsHTMLEntity;
|
||||||
P: PChar;
|
P: PChar;
|
||||||
ch: Char;
|
ch: Char;
|
||||||
begin
|
begin
|
||||||
@ -396,5 +412,87 @@ begin
|
|||||||
end;
|
end;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
|
|
||||||
|
{==============================================================================}
|
||||||
|
{ TsHTMLAttr }
|
||||||
|
{==============================================================================}
|
||||||
|
|
||||||
|
constructor TsHTMLAttr.Create(AName, AValue: String);
|
||||||
|
begin
|
||||||
|
Name := AName;
|
||||||
|
Value := AValue;
|
||||||
|
end;
|
||||||
|
|
||||||
|
|
||||||
|
{==============================================================================}
|
||||||
|
{ TsHTMLAttrList }
|
||||||
|
{==============================================================================}
|
||||||
|
|
||||||
|
function TsHTMLAttrList.GetItem(AIndex: Integer): TsHTMLAttr;
|
||||||
|
begin
|
||||||
|
Result := TsHTMLAttr(inherited GetItem(AIndex));
|
||||||
|
end;
|
||||||
|
|
||||||
|
function TsHTMLAttrList.IndexOfName(AName: String): Integer;
|
||||||
|
begin
|
||||||
|
AName := Lowercase(AName);
|
||||||
|
for Result := 0 to Count-1 do
|
||||||
|
if GetItem(Result).Name = AName then
|
||||||
|
exit;
|
||||||
|
Result := -1;
|
||||||
|
end;
|
||||||
|
|
||||||
|
{ AHTML is a HTML string beginning with a < tag. Seeks the first space to split
|
||||||
|
off the HTML tag. Then seeks for = and " characters to extract the attributes
|
||||||
|
which are split into name/value pairs at the = character. The value part is
|
||||||
|
unquoted. }
|
||||||
|
procedure TsHTMLAttrList.Parse(AHTML: String);
|
||||||
|
var
|
||||||
|
i: Integer;
|
||||||
|
len: Integer;
|
||||||
|
value, nam: String;
|
||||||
|
begin
|
||||||
|
Clear;
|
||||||
|
if (AHTML[1] <> '<') then // just for simplification
|
||||||
|
raise Exception.Create('[THTMLAttrList.Parse] HTML tags expected.');
|
||||||
|
|
||||||
|
// Find first space
|
||||||
|
i := 1;
|
||||||
|
len := Length(AHTML);
|
||||||
|
while (i <= len) and (AHTML[i] <> ' ') do inc(i);
|
||||||
|
|
||||||
|
// Parse attribute string
|
||||||
|
nam := '';
|
||||||
|
while (i <= len) do
|
||||||
|
begin
|
||||||
|
case AHTML[i] of
|
||||||
|
'=': begin
|
||||||
|
inc(i);
|
||||||
|
if AHTML[i] <> '"' then
|
||||||
|
raise Exception.Create('[THTMLAttrList.Parse] Quotation marks expected.');
|
||||||
|
value := '';
|
||||||
|
inc(i); // skip the initial '"'
|
||||||
|
while (AHTML[i] <> '"') do
|
||||||
|
begin
|
||||||
|
value := value + AHTML[i];
|
||||||
|
inc(i);
|
||||||
|
end;
|
||||||
|
inc(i); // skip the final '"'
|
||||||
|
Add(TsHTMLAttr.Create(lowercase(nam), value));
|
||||||
|
nam := '';
|
||||||
|
end;
|
||||||
|
' ', '/', '>': ;
|
||||||
|
else nam := nam + AHTML[i];
|
||||||
|
end;
|
||||||
|
inc(i);
|
||||||
|
end;
|
||||||
|
end;
|
||||||
|
|
||||||
|
procedure TsHTMLAttrList.SetItem(AIndex: Integer; AValue: TsHTMLAttr);
|
||||||
|
begin
|
||||||
|
inherited SetItem(AIndex, AValue);
|
||||||
|
end;
|
||||||
|
|
||||||
|
|
||||||
end.
|
end.
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user