From c5677621dc15cb7de1387ddf85408d6d2e1ac5b2 Mon Sep 17 00:00:00 2001 From: wp_xxyyzz Date: Sun, 2 Aug 2015 11:16:31 +0000 Subject: [PATCH] fpspreadsheet: Evaluate the colspan and rowspan attributes to merge cells when reading html files git-svn-id: https://svn.code.sf.net/p/lazarus-ccr/svn@4237 8e941d3f-bd1b-0410-a28a-d453659cc2b4 --- components/fpspreadsheet/fpshtml.pas | 114 ++++++++++++++-------- components/fpspreadsheet/fpshtmlutils.pas | 112 +++++++++++++++++++-- 2 files changed, 180 insertions(+), 46 deletions(-) diff --git a/components/fpspreadsheet/fpshtml.pas b/components/fpspreadsheet/fpshtml.pas index 4c2a88a20..b542e4401 100644 --- a/components/fpspreadsheet/fpshtml.pas +++ b/components/fpspreadsheet/fpshtml.pas @@ -6,7 +6,7 @@ interface uses Classes, SysUtils, fasthtmlparser, - fpstypes, fpspreadsheet, fpsReaderWriter; + fpstypes, fpspreadsheet, fpsReaderWriter, fpsHTMLUtils; type TsHTMLTokenKind = (htkTABLE, htkTR, htkTH, htkTD, htkDIV, htkSPAN, htkP); @@ -29,10 +29,13 @@ type FTableCounter: Integer; FCurrRow, FCurrCol: LongInt; FCelLText: String; + FAttrList: TsHTMLAttrList; + FColSpan, FRowSpan: Integer; + procedure ExtractMergedRange; procedure TagFoundHandler(NoCaseTag, ActualTag: string); procedure TextFoundHandler(AText: String); protected - procedure ProcessCellValue(ARow, ACol: LongInt; AText: String); + procedure AddCell(ARow, ACol: LongInt; AText: String); public constructor Create(AWorkbook: TsWorkbook); override; destructor Destroy; override; @@ -113,7 +116,7 @@ implementation uses LazUTF8, URIParser, StrUtils, - fpsUtils, fpsHTMLUtils, fpsNumFormat; + fpsUtils, fpsNumFormat; (* type THTMLEntity = record @@ -413,44 +416,17 @@ begin FFormatSettings := HTMLParams.FormatSettings; ReplaceFormatSettings(FFormatSettings, FWorkbook.FormatSettings); FTableCounter := -1; + FAttrList := TsHTMLAttrList.Create; end; destructor TsHTMLReader.Destroy; begin + FreeAndNil(FAttrList); FreeAndNil(parser); inherited Destroy; end; -procedure TsHTMLReader.ReadFromStream(AStream: TStream); -var - list: TStringList; -begin - list := TStringList.Create; - try - list.LoadFromStream(AStream); - ReadFromStrings(list); - if FWorkbook.GetWorksheetCount = 0 then - begin - FWorkbook.AddErrorMsg('Requested table not found, or no tables in html file'); - FWorkbook.AddWorksheet('Dummy'); - end; - finally - list.Free; - end; -end; - -procedure TsHTMLReader.ReadFromStrings(AStrings: TStrings); -begin - // Create html parser - FreeAndNil(parser); - parser := THTMLParser.Create(AStrings.Text); - parser.OnFoundTag := @TagFoundHandler; - parser.OnFoundText := @TextFoundHandler; - // Execute the html parser - parser.Exec; -end; - -procedure TsHTMLReader.ProcessCellValue(ARow, ACol: LongInt; AText: String); +procedure TsHTMLReader.AddCell(ARow, ACol: LongInt; AText: String); var cell: PCell; dblValue: Double; @@ -467,6 +443,10 @@ begin cell := FWorksheet.AddCell(ARow, ACol); + // Merged cells + if (FColSpan > 0) or (FRowSpan > 0) then + FWorksheet.MergeCells(ARow, ACol, ARow + FRowSpan, ACol + FColSpan); + // Do not try to interpret the strings. --> everything is a LABEL cell. if not HTMLParams.DetectContentType then begin @@ -506,6 +486,49 @@ begin FWorksheet.WriteUTF8Text(cell, AText); end; +procedure TsHTMLReader.ExtractMergedRange; +var + idx: Integer; +begin + FColSpan := 0; + FRowSpan := 0; + idx := FAttrList.IndexOfName('colspan'); + if idx > -1 then + FColSpan := StrToInt(FAttrList[idx].Value) - 1; + idx := FAttrList.IndexOfName('rowspan'); + if idx > -1 then + FRowSpan := StrToInt(FAttrList[idx].Value) - 1; + // -1 to compensate for correct determination of the range end cell +end; + +procedure TsHTMLReader.ReadFromStream(AStream: TStream); +var + list: TStringList; +begin + list := TStringList.Create; + try + list.LoadFromStream(AStream); + ReadFromStrings(list); + if FWorkbook.GetWorksheetCount = 0 then + begin + FWorkbook.AddErrorMsg('Requested table not found, or no tables in html file'); + FWorkbook.AddWorksheet('Dummy'); + end; + finally + list.Free; + end; +end; + +procedure TsHTMLReader.ReadFromStrings(AStrings: TStrings); +begin + // Create html parser + FreeAndNil(parser); + parser := THTMLParser.Create(AStrings.Text); + parser.OnFoundTag := @TagFoundHandler; + parser.OnFoundText := @TextFoundHandler; + // Execute the html parser + parser.Exec; +end; procedure TsHTMLReader.TagFoundHandler(NoCaseTag, ActualTag: string); begin @@ -537,6 +560,8 @@ begin FInCell := true; inc(FCurrCol); FCellText := ''; + FAttrList.Parse(ActualTag); + ExtractMergedRange; end else if ((NoCaseTag = '') or (pos('', '': if FInCell then begin - ProcessCellValue(FCurrRow, FCurrCol, FCellText); +// inc(FCurrCol); + while FWorksheet.isMerged(FWorksheet.FindCell(FCurrRow, FCurrRow)) do + inc(FCurrRow); + { + if FWorksheet.IsMerged(FWorksheet.FindCell(FCurrRow, FCurrCol)) then + begin + repeat + inc(FCurrRow); + until not FWorksheet.IsMerged(FWorksheet.FindCell(FCurrRow, FCurrCol)); + dec(FCurrCol); + end; + } + AddCell(FCurrRow, FCurrCol, FCellText); FInCell := false; end; '': @@ -576,12 +613,11 @@ begin if FInCell then FInSpan := false; '

', '

', '

', '

', '

', '
': if FinCell then FInHeader := false; - '', '': + '', '': // empty rows if FInTable then inc(FCurrRow); - '', '': - if FInCell then inc(FCurrCol); - '', '': - if FInCell then inc(FCurrCol); + '', '', '', '': // empty cells + if FInCell then + inc(FCurrCol); end; end; diff --git a/components/fpspreadsheet/fpshtmlutils.pas b/components/fpspreadsheet/fpshtmlutils.pas index 1fef4655f..6e560ae24 100644 --- a/components/fpspreadsheet/fpshtmlutils.pas +++ b/components/fpspreadsheet/fpshtmlutils.pas @@ -5,18 +5,34 @@ unit fpsHTMLUtils; interface uses - Classes, SysUtils; + Classes, SysUtils, contnrs; type - THTMLEntity = record + TsHTMLEntity = record E: String; Ch: String; N: Word; end; - -function IsHTMLEntity(AText: PChar; out AEntity: THTMLEntity): Boolean; function CleanHTMLString(AText: String): String; +function IsHTMLEntity(AText: PChar; out AEntity: TsHTMLEntity): Boolean; + +type + TsHTMLAttr = class + Name: String; + Value: String; + constructor Create(AName, AValue: String); + end; + + TsHTMLAttrList = class(TObjectList) + private + function GetItem(AIndex: Integer): TsHTMLAttr; + procedure SetItem(AIndex: Integer; AValue: TsHTMLAttr); + public + function IndexOfName(AName: String): Integer; + procedure Parse(AHTML: String); + property Items[AIndex: Integer]: TsHTMLAttr read GetItem write SetItem; default; + end; implementation @@ -26,7 +42,7 @@ uses const // http://unicode.e-workers.de/entities.php - HTMLEntities: array[0..250] of THTMLEntity = ( + HTMLEntities: array[0..250] of TsHTMLEntity = ( // A (E: 'Acirc'; Ch: 'Â'; N: 194), // 0 (E: 'acirc'; Ch: 'â'; N: 226), @@ -305,7 +321,7 @@ const (E: 'zwnj'; Ch: ''; N: 8204) // Zero-width non-joiner ); -function IsHTMLEntity(AText: PChar; out AEntity: THTMLEntity): Boolean; +function IsHTMLEntity(AText: PChar; out AEntity: TsHTMLEntity): Boolean; function Compare(s: String): Boolean; var @@ -357,7 +373,7 @@ end; function CleanHTMLString(AText: String): String; var len: Integer; - ent: THTMLEntity; + ent: TsHTMLEntity; P: PChar; ch: Char; begin @@ -396,5 +412,87 @@ begin end; end; + +{==============================================================================} +{ TsHTMLAttr } +{==============================================================================} + +constructor TsHTMLAttr.Create(AName, AValue: String); +begin + Name := AName; + Value := AValue; +end; + + +{==============================================================================} +{ TsHTMLAttrList } +{==============================================================================} + +function TsHTMLAttrList.GetItem(AIndex: Integer): TsHTMLAttr; +begin + Result := TsHTMLAttr(inherited GetItem(AIndex)); +end; + +function TsHTMLAttrList.IndexOfName(AName: String): Integer; +begin + AName := Lowercase(AName); + for Result := 0 to Count-1 do + if GetItem(Result).Name = AName then + exit; + Result := -1; +end; + +{ AHTML is a HTML string beginning with a < tag. Seeks the first space to split + off the HTML tag. Then seeks for = and " characters to extract the attributes + which are split into name/value pairs at the = character. The value part is + unquoted. } +procedure TsHTMLAttrList.Parse(AHTML: String); +var + i: Integer; + len: Integer; + value, nam: String; +begin + Clear; + if (AHTML[1] <> '<') then // just for simplification + raise Exception.Create('[THTMLAttrList.Parse] HTML tags expected.'); + + // Find first space + i := 1; + len := Length(AHTML); + while (i <= len) and (AHTML[i] <> ' ') do inc(i); + + // Parse attribute string + nam := ''; + while (i <= len) do + begin + case AHTML[i] of + '=': begin + inc(i); + if AHTML[i] <> '"' then + raise Exception.Create('[THTMLAttrList.Parse] Quotation marks expected.'); + value := ''; + inc(i); // skip the initial '"' + while (AHTML[i] <> '"') do + begin + value := value + AHTML[i]; + inc(i); + end; + inc(i); // skip the final '"' + Add(TsHTMLAttr.Create(lowercase(nam), value)); + nam := ''; + end; + ' ', '/', '>': ; + else nam := nam + AHTML[i]; + end; + inc(i); + end; +end; + +procedure TsHTMLAttrList.SetItem(AIndex: Integer; AValue: TsHTMLAttr); +begin + inherited SetItem(AIndex, AValue); +end; + + end.