You've already forked lazarus-ccr
fpspreadsheet: Evaluate the colspan and rowspan attributes to merge cells when reading html files
git-svn-id: https://svn.code.sf.net/p/lazarus-ccr/svn@4237 8e941d3f-bd1b-0410-a28a-d453659cc2b4
This commit is contained in:
@ -6,7 +6,7 @@ interface
|
||||
|
||||
uses
|
||||
Classes, SysUtils, fasthtmlparser,
|
||||
fpstypes, fpspreadsheet, fpsReaderWriter;
|
||||
fpstypes, fpspreadsheet, fpsReaderWriter, fpsHTMLUtils;
|
||||
|
||||
type
|
||||
TsHTMLTokenKind = (htkTABLE, htkTR, htkTH, htkTD, htkDIV, htkSPAN, htkP);
|
||||
@ -29,10 +29,13 @@ type
|
||||
FTableCounter: Integer;
|
||||
FCurrRow, FCurrCol: LongInt;
|
||||
FCelLText: String;
|
||||
FAttrList: TsHTMLAttrList;
|
||||
FColSpan, FRowSpan: Integer;
|
||||
procedure ExtractMergedRange;
|
||||
procedure TagFoundHandler(NoCaseTag, ActualTag: string);
|
||||
procedure TextFoundHandler(AText: String);
|
||||
protected
|
||||
procedure ProcessCellValue(ARow, ACol: LongInt; AText: String);
|
||||
procedure AddCell(ARow, ACol: LongInt; AText: String);
|
||||
public
|
||||
constructor Create(AWorkbook: TsWorkbook); override;
|
||||
destructor Destroy; override;
|
||||
@ -113,7 +116,7 @@ implementation
|
||||
|
||||
uses
|
||||
LazUTF8, URIParser, StrUtils,
|
||||
fpsUtils, fpsHTMLUtils, fpsNumFormat;
|
||||
fpsUtils, fpsNumFormat;
|
||||
(*
|
||||
type
|
||||
THTMLEntity = record
|
||||
@ -413,44 +416,17 @@ begin
|
||||
FFormatSettings := HTMLParams.FormatSettings;
|
||||
ReplaceFormatSettings(FFormatSettings, FWorkbook.FormatSettings);
|
||||
FTableCounter := -1;
|
||||
FAttrList := TsHTMLAttrList.Create;
|
||||
end;
|
||||
|
||||
destructor TsHTMLReader.Destroy;
|
||||
begin
|
||||
FreeAndNil(FAttrList);
|
||||
FreeAndNil(parser);
|
||||
inherited Destroy;
|
||||
end;
|
||||
|
||||
procedure TsHTMLReader.ReadFromStream(AStream: TStream);
|
||||
var
|
||||
list: TStringList;
|
||||
begin
|
||||
list := TStringList.Create;
|
||||
try
|
||||
list.LoadFromStream(AStream);
|
||||
ReadFromStrings(list);
|
||||
if FWorkbook.GetWorksheetCount = 0 then
|
||||
begin
|
||||
FWorkbook.AddErrorMsg('Requested table not found, or no tables in html file');
|
||||
FWorkbook.AddWorksheet('Dummy');
|
||||
end;
|
||||
finally
|
||||
list.Free;
|
||||
end;
|
||||
end;
|
||||
|
||||
procedure TsHTMLReader.ReadFromStrings(AStrings: TStrings);
|
||||
begin
|
||||
// Create html parser
|
||||
FreeAndNil(parser);
|
||||
parser := THTMLParser.Create(AStrings.Text);
|
||||
parser.OnFoundTag := @TagFoundHandler;
|
||||
parser.OnFoundText := @TextFoundHandler;
|
||||
// Execute the html parser
|
||||
parser.Exec;
|
||||
end;
|
||||
|
||||
procedure TsHTMLReader.ProcessCellValue(ARow, ACol: LongInt; AText: String);
|
||||
procedure TsHTMLReader.AddCell(ARow, ACol: LongInt; AText: String);
|
||||
var
|
||||
cell: PCell;
|
||||
dblValue: Double;
|
||||
@ -467,6 +443,10 @@ begin
|
||||
|
||||
cell := FWorksheet.AddCell(ARow, ACol);
|
||||
|
||||
// Merged cells
|
||||
if (FColSpan > 0) or (FRowSpan > 0) then
|
||||
FWorksheet.MergeCells(ARow, ACol, ARow + FRowSpan, ACol + FColSpan);
|
||||
|
||||
// Do not try to interpret the strings. --> everything is a LABEL cell.
|
||||
if not HTMLParams.DetectContentType then
|
||||
begin
|
||||
@ -506,6 +486,49 @@ begin
|
||||
FWorksheet.WriteUTF8Text(cell, AText);
|
||||
end;
|
||||
|
||||
procedure TsHTMLReader.ExtractMergedRange;
|
||||
var
|
||||
idx: Integer;
|
||||
begin
|
||||
FColSpan := 0;
|
||||
FRowSpan := 0;
|
||||
idx := FAttrList.IndexOfName('colspan');
|
||||
if idx > -1 then
|
||||
FColSpan := StrToInt(FAttrList[idx].Value) - 1;
|
||||
idx := FAttrList.IndexOfName('rowspan');
|
||||
if idx > -1 then
|
||||
FRowSpan := StrToInt(FAttrList[idx].Value) - 1;
|
||||
// -1 to compensate for correct determination of the range end cell
|
||||
end;
|
||||
|
||||
procedure TsHTMLReader.ReadFromStream(AStream: TStream);
|
||||
var
|
||||
list: TStringList;
|
||||
begin
|
||||
list := TStringList.Create;
|
||||
try
|
||||
list.LoadFromStream(AStream);
|
||||
ReadFromStrings(list);
|
||||
if FWorkbook.GetWorksheetCount = 0 then
|
||||
begin
|
||||
FWorkbook.AddErrorMsg('Requested table not found, or no tables in html file');
|
||||
FWorkbook.AddWorksheet('Dummy');
|
||||
end;
|
||||
finally
|
||||
list.Free;
|
||||
end;
|
||||
end;
|
||||
|
||||
procedure TsHTMLReader.ReadFromStrings(AStrings: TStrings);
|
||||
begin
|
||||
// Create html parser
|
||||
FreeAndNil(parser);
|
||||
parser := THTMLParser.Create(AStrings.Text);
|
||||
parser.OnFoundTag := @TagFoundHandler;
|
||||
parser.OnFoundText := @TextFoundHandler;
|
||||
// Execute the html parser
|
||||
parser.Exec;
|
||||
end;
|
||||
|
||||
procedure TsHTMLReader.TagFoundHandler(NoCaseTag, ActualTag: string);
|
||||
begin
|
||||
@ -537,6 +560,8 @@ begin
|
||||
FInCell := true;
|
||||
inc(FCurrCol);
|
||||
FCellText := '';
|
||||
FAttrList.Parse(ActualTag);
|
||||
ExtractMergedRange;
|
||||
end else
|
||||
if ((NoCaseTag = '<TH>') or (pos('<TH ', NoCaseTag) = 1)) and FInTable then
|
||||
begin
|
||||
@ -567,7 +592,19 @@ begin
|
||||
'</TD>', '</TH>':
|
||||
if FInCell then
|
||||
begin
|
||||
ProcessCellValue(FCurrRow, FCurrCol, FCellText);
|
||||
// inc(FCurrCol);
|
||||
while FWorksheet.isMerged(FWorksheet.FindCell(FCurrRow, FCurrRow)) do
|
||||
inc(FCurrRow);
|
||||
{
|
||||
if FWorksheet.IsMerged(FWorksheet.FindCell(FCurrRow, FCurrCol)) then
|
||||
begin
|
||||
repeat
|
||||
inc(FCurrRow);
|
||||
until not FWorksheet.IsMerged(FWorksheet.FindCell(FCurrRow, FCurrCol));
|
||||
dec(FCurrCol);
|
||||
end;
|
||||
}
|
||||
AddCell(FCurrRow, FCurrCol, FCellText);
|
||||
FInCell := false;
|
||||
end;
|
||||
'</A>':
|
||||
@ -576,12 +613,11 @@ begin
|
||||
if FInCell then FInSpan := false;
|
||||
'<H1/>', '<H2/>', '<H3/>', '<H4/>', '<H5/>', '<H6/>':
|
||||
if FinCell then FInHeader := false;
|
||||
'<TR/>', '<TR />':
|
||||
'<TR/>', '<TR />': // empty rows
|
||||
if FInTable then inc(FCurrRow);
|
||||
'<TD/>', '<TD />':
|
||||
if FInCell then inc(FCurrCol);
|
||||
'<TH/>', '<TH />':
|
||||
if FInCell then inc(FCurrCol);
|
||||
'<TD/>', '<TD />', '<TH/>', '<TH />': // empty cells
|
||||
if FInCell then
|
||||
inc(FCurrCol);
|
||||
end;
|
||||
end;
|
||||
|
||||
|
@ -5,18 +5,34 @@ unit fpsHTMLUtils;
|
||||
interface
|
||||
|
||||
uses
|
||||
Classes, SysUtils;
|
||||
Classes, SysUtils, contnrs;
|
||||
|
||||
type
|
||||
THTMLEntity = record
|
||||
TsHTMLEntity = record
|
||||
E: String;
|
||||
Ch: String;
|
||||
N: Word;
|
||||
end;
|
||||
|
||||
|
||||
function IsHTMLEntity(AText: PChar; out AEntity: THTMLEntity): Boolean;
|
||||
function CleanHTMLString(AText: String): String;
|
||||
function IsHTMLEntity(AText: PChar; out AEntity: TsHTMLEntity): Boolean;
|
||||
|
||||
type
|
||||
TsHTMLAttr = class
|
||||
Name: String;
|
||||
Value: String;
|
||||
constructor Create(AName, AValue: String);
|
||||
end;
|
||||
|
||||
TsHTMLAttrList = class(TObjectList)
|
||||
private
|
||||
function GetItem(AIndex: Integer): TsHTMLAttr;
|
||||
procedure SetItem(AIndex: Integer; AValue: TsHTMLAttr);
|
||||
public
|
||||
function IndexOfName(AName: String): Integer;
|
||||
procedure Parse(AHTML: String);
|
||||
property Items[AIndex: Integer]: TsHTMLAttr read GetItem write SetItem; default;
|
||||
end;
|
||||
|
||||
|
||||
implementation
|
||||
@ -26,7 +42,7 @@ uses
|
||||
|
||||
const
|
||||
// http://unicode.e-workers.de/entities.php
|
||||
HTMLEntities: array[0..250] of THTMLEntity = (
|
||||
HTMLEntities: array[0..250] of TsHTMLEntity = (
|
||||
// A
|
||||
(E: 'Acirc'; Ch: 'Â'; N: 194), // 0
|
||||
(E: 'acirc'; Ch: 'â'; N: 226),
|
||||
@ -305,7 +321,7 @@ const
|
||||
(E: 'zwnj'; Ch: ''; N: 8204) // Zero-width non-joiner
|
||||
);
|
||||
|
||||
function IsHTMLEntity(AText: PChar; out AEntity: THTMLEntity): Boolean;
|
||||
function IsHTMLEntity(AText: PChar; out AEntity: TsHTMLEntity): Boolean;
|
||||
|
||||
function Compare(s: String): Boolean;
|
||||
var
|
||||
@ -357,7 +373,7 @@ end;
|
||||
function CleanHTMLString(AText: String): String;
|
||||
var
|
||||
len: Integer;
|
||||
ent: THTMLEntity;
|
||||
ent: TsHTMLEntity;
|
||||
P: PChar;
|
||||
ch: Char;
|
||||
begin
|
||||
@ -396,5 +412,87 @@ begin
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
{==============================================================================}
|
||||
{ TsHTMLAttr }
|
||||
{==============================================================================}
|
||||
|
||||
constructor TsHTMLAttr.Create(AName, AValue: String);
|
||||
begin
|
||||
Name := AName;
|
||||
Value := AValue;
|
||||
end;
|
||||
|
||||
|
||||
{==============================================================================}
|
||||
{ TsHTMLAttrList }
|
||||
{==============================================================================}
|
||||
|
||||
function TsHTMLAttrList.GetItem(AIndex: Integer): TsHTMLAttr;
|
||||
begin
|
||||
Result := TsHTMLAttr(inherited GetItem(AIndex));
|
||||
end;
|
||||
|
||||
function TsHTMLAttrList.IndexOfName(AName: String): Integer;
|
||||
begin
|
||||
AName := Lowercase(AName);
|
||||
for Result := 0 to Count-1 do
|
||||
if GetItem(Result).Name = AName then
|
||||
exit;
|
||||
Result := -1;
|
||||
end;
|
||||
|
||||
{ AHTML is a HTML string beginning with a < tag. Seeks the first space to split
|
||||
off the HTML tag. Then seeks for = and " characters to extract the attributes
|
||||
which are split into name/value pairs at the = character. The value part is
|
||||
unquoted. }
|
||||
procedure TsHTMLAttrList.Parse(AHTML: String);
|
||||
var
|
||||
i: Integer;
|
||||
len: Integer;
|
||||
value, nam: String;
|
||||
begin
|
||||
Clear;
|
||||
if (AHTML[1] <> '<') then // just for simplification
|
||||
raise Exception.Create('[THTMLAttrList.Parse] HTML tags expected.');
|
||||
|
||||
// Find first space
|
||||
i := 1;
|
||||
len := Length(AHTML);
|
||||
while (i <= len) and (AHTML[i] <> ' ') do inc(i);
|
||||
|
||||
// Parse attribute string
|
||||
nam := '';
|
||||
while (i <= len) do
|
||||
begin
|
||||
case AHTML[i] of
|
||||
'=': begin
|
||||
inc(i);
|
||||
if AHTML[i] <> '"' then
|
||||
raise Exception.Create('[THTMLAttrList.Parse] Quotation marks expected.');
|
||||
value := '';
|
||||
inc(i); // skip the initial '"'
|
||||
while (AHTML[i] <> '"') do
|
||||
begin
|
||||
value := value + AHTML[i];
|
||||
inc(i);
|
||||
end;
|
||||
inc(i); // skip the final '"'
|
||||
Add(TsHTMLAttr.Create(lowercase(nam), value));
|
||||
nam := '';
|
||||
end;
|
||||
' ', '/', '>': ;
|
||||
else nam := nam + AHTML[i];
|
||||
end;
|
||||
inc(i);
|
||||
end;
|
||||
end;
|
||||
|
||||
procedure TsHTMLAttrList.SetItem(AIndex: Integer; AValue: TsHTMLAttr);
|
||||
begin
|
||||
inherited SetItem(AIndex, AValue);
|
||||
end;
|
||||
|
||||
|
||||
end.
|
||||
|
||||
|
Reference in New Issue
Block a user