fpspreadsheet: Evaluate the colspan and rowspan attributes to merge cells when reading html files

git-svn-id: https://svn.code.sf.net/p/lazarus-ccr/svn@4237 8e941d3f-bd1b-0410-a28a-d453659cc2b4
This commit is contained in:
wp_xxyyzz
2015-08-02 11:16:31 +00:00
parent 7c8e0a8b3d
commit c5677621dc
2 changed files with 180 additions and 46 deletions

View File

@ -6,7 +6,7 @@ interface
uses
Classes, SysUtils, fasthtmlparser,
fpstypes, fpspreadsheet, fpsReaderWriter;
fpstypes, fpspreadsheet, fpsReaderWriter, fpsHTMLUtils;
type
TsHTMLTokenKind = (htkTABLE, htkTR, htkTH, htkTD, htkDIV, htkSPAN, htkP);
@ -29,10 +29,13 @@ type
FTableCounter: Integer;
FCurrRow, FCurrCol: LongInt;
FCelLText: String;
FAttrList: TsHTMLAttrList;
FColSpan, FRowSpan: Integer;
procedure ExtractMergedRange;
procedure TagFoundHandler(NoCaseTag, ActualTag: string);
procedure TextFoundHandler(AText: String);
protected
procedure ProcessCellValue(ARow, ACol: LongInt; AText: String);
procedure AddCell(ARow, ACol: LongInt; AText: String);
public
constructor Create(AWorkbook: TsWorkbook); override;
destructor Destroy; override;
@ -113,7 +116,7 @@ implementation
uses
LazUTF8, URIParser, StrUtils,
fpsUtils, fpsHTMLUtils, fpsNumFormat;
fpsUtils, fpsNumFormat;
(*
type
THTMLEntity = record
@ -413,44 +416,17 @@ begin
FFormatSettings := HTMLParams.FormatSettings;
ReplaceFormatSettings(FFormatSettings, FWorkbook.FormatSettings);
FTableCounter := -1;
FAttrList := TsHTMLAttrList.Create;
end;
destructor TsHTMLReader.Destroy;
begin
FreeAndNil(FAttrList);
FreeAndNil(parser);
inherited Destroy;
end;
procedure TsHTMLReader.ReadFromStream(AStream: TStream);
var
list: TStringList;
begin
list := TStringList.Create;
try
list.LoadFromStream(AStream);
ReadFromStrings(list);
if FWorkbook.GetWorksheetCount = 0 then
begin
FWorkbook.AddErrorMsg('Requested table not found, or no tables in html file');
FWorkbook.AddWorksheet('Dummy');
end;
finally
list.Free;
end;
end;
procedure TsHTMLReader.ReadFromStrings(AStrings: TStrings);
begin
// Create html parser
FreeAndNil(parser);
parser := THTMLParser.Create(AStrings.Text);
parser.OnFoundTag := @TagFoundHandler;
parser.OnFoundText := @TextFoundHandler;
// Execute the html parser
parser.Exec;
end;
procedure TsHTMLReader.ProcessCellValue(ARow, ACol: LongInt; AText: String);
procedure TsHTMLReader.AddCell(ARow, ACol: LongInt; AText: String);
var
cell: PCell;
dblValue: Double;
@ -467,6 +443,10 @@ begin
cell := FWorksheet.AddCell(ARow, ACol);
// Merged cells
if (FColSpan > 0) or (FRowSpan > 0) then
FWorksheet.MergeCells(ARow, ACol, ARow + FRowSpan, ACol + FColSpan);
// Do not try to interpret the strings. --> everything is a LABEL cell.
if not HTMLParams.DetectContentType then
begin
@ -506,6 +486,49 @@ begin
FWorksheet.WriteUTF8Text(cell, AText);
end;
procedure TsHTMLReader.ExtractMergedRange;
var
idx: Integer;
begin
FColSpan := 0;
FRowSpan := 0;
idx := FAttrList.IndexOfName('colspan');
if idx > -1 then
FColSpan := StrToInt(FAttrList[idx].Value) - 1;
idx := FAttrList.IndexOfName('rowspan');
if idx > -1 then
FRowSpan := StrToInt(FAttrList[idx].Value) - 1;
// -1 to compensate for correct determination of the range end cell
end;
procedure TsHTMLReader.ReadFromStream(AStream: TStream);
var
list: TStringList;
begin
list := TStringList.Create;
try
list.LoadFromStream(AStream);
ReadFromStrings(list);
if FWorkbook.GetWorksheetCount = 0 then
begin
FWorkbook.AddErrorMsg('Requested table not found, or no tables in html file');
FWorkbook.AddWorksheet('Dummy');
end;
finally
list.Free;
end;
end;
procedure TsHTMLReader.ReadFromStrings(AStrings: TStrings);
begin
// Create html parser
FreeAndNil(parser);
parser := THTMLParser.Create(AStrings.Text);
parser.OnFoundTag := @TagFoundHandler;
parser.OnFoundText := @TextFoundHandler;
// Execute the html parser
parser.Exec;
end;
procedure TsHTMLReader.TagFoundHandler(NoCaseTag, ActualTag: string);
begin
@ -537,6 +560,8 @@ begin
FInCell := true;
inc(FCurrCol);
FCellText := '';
FAttrList.Parse(ActualTag);
ExtractMergedRange;
end else
if ((NoCaseTag = '<TH>') or (pos('<TH ', NoCaseTag) = 1)) and FInTable then
begin
@ -567,7 +592,19 @@ begin
'</TD>', '</TH>':
if FInCell then
begin
ProcessCellValue(FCurrRow, FCurrCol, FCellText);
// inc(FCurrCol);
while FWorksheet.isMerged(FWorksheet.FindCell(FCurrRow, FCurrRow)) do
inc(FCurrRow);
{
if FWorksheet.IsMerged(FWorksheet.FindCell(FCurrRow, FCurrCol)) then
begin
repeat
inc(FCurrRow);
until not FWorksheet.IsMerged(FWorksheet.FindCell(FCurrRow, FCurrCol));
dec(FCurrCol);
end;
}
AddCell(FCurrRow, FCurrCol, FCellText);
FInCell := false;
end;
'</A>':
@ -576,12 +613,11 @@ begin
if FInCell then FInSpan := false;
'<H1/>', '<H2/>', '<H3/>', '<H4/>', '<H5/>', '<H6/>':
if FinCell then FInHeader := false;
'<TR/>', '<TR />':
'<TR/>', '<TR />': // empty rows
if FInTable then inc(FCurrRow);
'<TD/>', '<TD />':
if FInCell then inc(FCurrCol);
'<TH/>', '<TH />':
if FInCell then inc(FCurrCol);
'<TD/>', '<TD />', '<TH/>', '<TH />': // empty cells
if FInCell then
inc(FCurrCol);
end;
end;

View File

@ -5,18 +5,34 @@ unit fpsHTMLUtils;
interface
uses
Classes, SysUtils;
Classes, SysUtils, contnrs;
type
THTMLEntity = record
TsHTMLEntity = record
E: String;
Ch: String;
N: Word;
end;
function IsHTMLEntity(AText: PChar; out AEntity: THTMLEntity): Boolean;
function CleanHTMLString(AText: String): String;
function IsHTMLEntity(AText: PChar; out AEntity: TsHTMLEntity): Boolean;
type
TsHTMLAttr = class
Name: String;
Value: String;
constructor Create(AName, AValue: String);
end;
TsHTMLAttrList = class(TObjectList)
private
function GetItem(AIndex: Integer): TsHTMLAttr;
procedure SetItem(AIndex: Integer; AValue: TsHTMLAttr);
public
function IndexOfName(AName: String): Integer;
procedure Parse(AHTML: String);
property Items[AIndex: Integer]: TsHTMLAttr read GetItem write SetItem; default;
end;
implementation
@ -26,7 +42,7 @@ uses
const
// http://unicode.e-workers.de/entities.php
HTMLEntities: array[0..250] of THTMLEntity = (
HTMLEntities: array[0..250] of TsHTMLEntity = (
// A
(E: 'Acirc'; Ch: 'Â'; N: 194), // 0
(E: 'acirc'; Ch: 'â'; N: 226),
@ -305,7 +321,7 @@ const
(E: 'zwnj'; Ch: ''; N: 8204) // Zero-width non-joiner
);
function IsHTMLEntity(AText: PChar; out AEntity: THTMLEntity): Boolean;
function IsHTMLEntity(AText: PChar; out AEntity: TsHTMLEntity): Boolean;
function Compare(s: String): Boolean;
var
@ -357,7 +373,7 @@ end;
function CleanHTMLString(AText: String): String;
var
len: Integer;
ent: THTMLEntity;
ent: TsHTMLEntity;
P: PChar;
ch: Char;
begin
@ -396,5 +412,87 @@ begin
end;
end;
{==============================================================================}
{ TsHTMLAttr }
{==============================================================================}
constructor TsHTMLAttr.Create(AName, AValue: String);
begin
Name := AName;
Value := AValue;
end;
{==============================================================================}
{ TsHTMLAttrList }
{==============================================================================}
function TsHTMLAttrList.GetItem(AIndex: Integer): TsHTMLAttr;
begin
Result := TsHTMLAttr(inherited GetItem(AIndex));
end;
function TsHTMLAttrList.IndexOfName(AName: String): Integer;
begin
AName := Lowercase(AName);
for Result := 0 to Count-1 do
if GetItem(Result).Name = AName then
exit;
Result := -1;
end;
{ AHTML is a HTML string beginning with a < tag. Seeks the first space to split
off the HTML tag. Then seeks for = and " characters to extract the attributes
which are split into name/value pairs at the = character. The value part is
unquoted. }
procedure TsHTMLAttrList.Parse(AHTML: String);
var
i: Integer;
len: Integer;
value, nam: String;
begin
Clear;
if (AHTML[1] <> '<') then // just for simplification
raise Exception.Create('[THTMLAttrList.Parse] HTML tags expected.');
// Find first space
i := 1;
len := Length(AHTML);
while (i <= len) and (AHTML[i] <> ' ') do inc(i);
// Parse attribute string
nam := '';
while (i <= len) do
begin
case AHTML[i] of
'=': begin
inc(i);
if AHTML[i] <> '"' then
raise Exception.Create('[THTMLAttrList.Parse] Quotation marks expected.');
value := '';
inc(i); // skip the initial '"'
while (AHTML[i] <> '"') do
begin
value := value + AHTML[i];
inc(i);
end;
inc(i); // skip the final '"'
Add(TsHTMLAttr.Create(lowercase(nam), value));
nam := '';
end;
' ', '/', '>': ;
else nam := nam + AHTML[i];
end;
inc(i);
end;
end;
procedure TsHTMLAttrList.SetItem(AIndex: Integer; AValue: TsHTMLAttr);
begin
inherited SetItem(AIndex, AValue);
end;
end.