fpspreadsheet: Simplified solution for multi-line text issue #39045 fixing also malformed UTF8 strings for ods.

git-svn-id: https://svn.code.sf.net/p/lazarus-ccr/svn@8568 8e941d3f-bd1b-0410-a28a-d453659cc2b4
2022-10-19 10:09:16 +00:00
parent 6036db7d88
commit df2c182ee2
3 changed files with 69 additions and 101 deletions
--- a/components/fpspreadsheet/source/common/fpsopendocument.pas
+++ b/components/fpspreadsheet/source/common/fpsopendocument.pas
@@ -9207,58 +9207,31 @@ var
  i, idx, endidx, fntidx, len: Integer;
  rtParam: TsRichTextParam;
  sheet: TsWorksheet;
-  {$IFDEF FPS_NO_LAZUNICODE}
-  ch: WideChar;
-  wideStr, txt: WideString;
-  {$ELSE}
-  ch: String;
-  chArray: array of string = nil;
  txt: String;
-  {$ENDIF}
+  chArray: array of string = nil;

  function IsNewLine(var idx: integer): Boolean;
  begin
-   {$IFDEF FPS_NO_LAZUNICODE}
-    if (wideStr[idx] = #13) then
-    begin
-      Result := true;
-      if (idx < len) and (widestr[idx+1] = #10) then inc(idx);
-    end else
-    if (wideStr[idx] = #10) then
-    begin
-      Result := true;
-      if (idx < len) and (widestr[idx+1] = #13) then inc(idx);
-    end else
-      Result := false;
-   {$ELSE}
    Result := true;
    case chArray[idx] of
-      #13: if (chArray[idx+1] = #10) then inc(idx);  // Cannot overrun because we had appended a #0 to string
-      #10: if (chArray[idx+1] = #13) then inc(idx);  // Cannot overrun because we had appended a #0 to string
+      #13: if (idx < len-1) and (chArray[idx+1] = #10) then inc(idx);
+      #10: if (idx < len-1) and (chArray[idx+1] = #13) then inc(idx);
      else Result := false;
    end;
-   {$ENDIF}
  end;

  procedure AppendTxt(NewLine: Boolean; FntStyle: String);
-  var
-    s: String;
  begin
-    {$IFDEF FPS_NO_LAZUNICODE}
-    s := UTF8Encode(txt);
-    {$ELSE}
-    s := txt;
-    {$ENDIF}
-    ValidXMLText(s);
-    if NewLine and (s = '') then
+    ValidXMLText(txt);
+    if NewLine and (txt = '') then
      totaltxt := totaltxt + '</text:p><text:p>'
    else
    begin
      if FntStyle = '' then
-        totaltxt := totaltxt + s
+        totaltxt := totaltxt + txt
      else
        totaltxt := totaltxt +
-          '<text:span text:style-name="' + FntStyle + '">' + s + '</text:span>';
+          '<text:span text:style-name="' + FntStyle + '">' + txt + '</text:span>';
      if NewLine then
        totaltxt := totaltxt + '</text:p><text:p>';
    end;
@@ -9322,6 +9295,10 @@ begin
  end
  else
  begin
+    // Split AValue into separate code points.
+    chArray := UTF8CodePoints(AValue);
+    len := Length(chArray);
+
    // No hyperlink, normal text only
    if Length(ACell^.RichTextParams) = 0 then
    begin
@@ -9334,73 +9311,33 @@ begin
        (until LO gets fixed).
      }
      totaltxt := '<text:p>';
-      len := Length(AValue);
-      idx := 1;
-      while (idx <= len) do
+      idx := 0;
+      txt := '';
+      while (idx < len) do
      begin
-        case AValue[idx] of
-          #13: begin
-                 totaltxt := totaltxt + '</text:p><text:p>';
-                 if (idx < len) and (AValue[idx+1] = #10) then inc(idx);
-               end;
-          #10: begin
-                 totaltxt := totaltxt + '</text:p><text:p>';
-                 if (idx < len) and (AValue[idx+1] = #13) then inc(idx);
-               end;
-          else totaltxt := totaltxt + AValue[idx];
-        end;
+        if IsNewLine(idx) then
+          AppendTxt(true, '')
+        else
+          totaltxt := totaltxt + chArray[idx];
        inc(idx);
      end;
      totaltxt := totaltxt + '</text:p>';
    end else
    begin
      // "Rich-text" formatting
-      txt := '';
-      {$IFDEF FPS_NO_LAZUNICODE}
-      wideStr := UTF8Decode(AValue);  // Convert to unicode
-      len := Length(wideStr);
-      {$ELSE}
-      // Split utf8-encoded string into the individual code points and store them
-      // in an array of strings.
-      // Index 0 of this array is ignored so that indices map to a normal string.
-      // An auxiliary #0 is appended so that the case of #13#10 at the end of
-      // AValue can be handled without complicated checks.
-      SetLength(chArray, Length(AValue));
-      i := 0;
-      for ch in AValue do
-      begin
-        chArray[i] := ch;
-        inc(i);
-      end;
-      len := i;
-      SetLength(chArray, len+2);
-      i := length(chArray)-1;
-      chArray[i] := #0;
-      dec(i);
-      while i > 0 do
-      begin
-        chArray[i] := chArray[i-1];
-        dec(i);
-      end;
-      chArray[0] := #0;
-      {$ENDIF}
-      // Before the first formatted section having the cell's format
+      // Before the first formatted section, chars are having the cell's format
      totaltxt := '<text:p>';
      rtParam := ACell^.RichTextParams[0];
-      idx := 1;
+      idx := 0;
+      txt := '';
      if rtParam.FirstIndex > 1 then
      begin
-        while (idx <= len) and (idx < rtParam.FirstIndex) do
+        while (idx < len) and (idx + 1 < rtParam.FirstIndex) do
        begin
-          {$IFDEF FPS_NO_LAZUNICODE}
-          ch := wideStr[idx];
-          {$ELSE}
-          ch := chArray[idx];
-          {$ENDIF}
          if IsNewLine(idx) then
            AppendTxt(true, '')
          else
-            txt := txt + ch;
+            txt := txt + chArray[idx];
          inc(idx);
        end;
        if txt <> '' then
@@ -9409,7 +9346,7 @@ begin
      txt := '';
      for i := 0 to High(ACell^.RichTextParams) do
      begin
-        // Formatted parts of the string according the RichTextParams
+        // Formatted parts of the string according to the RichTextParams
        rtParam := ACell^.RichTextParams[i];
        fnt := (FWorkbook as TsWorkbook).GetFont(rtParam.FontIndex);
        fntidx := FRichTextFontList.IndexOfObject(fnt);
@@ -9418,17 +9355,12 @@ begin
          endidx := ACell^.RichTextParams[i+1].FirstIndex-1
        else
          endidx := len;
-        while (idx <= len) and (idx <= endidx) do
+        while (idx < len) and (idx < endidx) do
        begin
-          {$IFDEF FPS_NO_LAZUNICODE}
-          ch := wideStr[idx];
-          {$ELSE}
-          ch := chArray[idx];
-          {$ENDIF}
          if IsNewLine(idx) then
            AppendTxt(true, fntName)
          else
-            txt := txt + ch;
+            txt := txt + chArray[idx];
          inc(idx);
        end;
        if txt <> '' then
--- a/components/fpspreadsheet/source/common/fpsutils.pas
+++ b/components/fpspreadsheet/source/common/fpsutils.pas
@@ -12,7 +12,6 @@ unit fpsUtils;
 // to do: Remove the declaration UTF8FormatSettings and InitUTF8FormatSettings
 //        when this same modification is in LazUtils of Laz stable

-
 {$mode objfpc}{$H+}

 interface
@@ -242,6 +241,8 @@ function GetFontAsString(AFont: TsFont): String;

 function ISO8601StrToDateTime(s: String): TDateTime;

+function UTF8CodePoints(s: string): TStringArray;
+
 procedure AppendToStream(AStream: TStream; const AString: String); inline; overload;
 procedure AppendToStream(AStream: TStream; const AString1, AString2: String); inline; overload;
 procedure AppendToStream(AStream: TStream; const AString1, AString2, AString3: String); inline; overload;
@@ -260,7 +261,9 @@ var
 implementation

 uses
-  Math, lazutf8, lazfileutils, fpsStrings, fpsReaderWriter;
+  Math, lazutf8, lazfileutils,
+  {$IFNDEF FPS_NO_LAZUNICODE}LazUnicode,{$ENDIF}
+  fpsStrings, fpsReaderWriter;

 const
  INT_NUM_LETTERS = 26;
@@ -3214,6 +3217,44 @@ begin
    Result := Result - GetLocalTimeOffset / MinsPerDay;
 end;

+{@@ ----------------------------------------------------------------------------
+  Splits the UTF8-encoded string into its code-points ("characters") and
+  returns them as an array of string.
+-------------------------------------------------------------------------------}
+function UTF8CodePoints(s: String): TStringArray;
+const
+  BROKEN_UTF8_REPLACEMENT = #$E2#$8E#$95; // Box character. Could also be a '?'.
+var
+  n: Integer;
+  ch: String;
+  P, PEnd: PChar;
+  chLen: Integer;
+begin
+  if s = '' then
+  begin
+    Result := nil;
+    exit;
+  end;
+  n := 0;
+  SetLength(Result, Length(s));
+  P := PChar(s);
+  PEnd := P + Length(s);
+  while P < PEnd do
+  begin
+    chLen := UTF8CodePointSize(P);
+    if (chLen = 1) and (P^ > #127) then
+      ch := BROKEN_UTF8_REPLACEMENT
+    else
+    begin
+      SetLength(ch, chLen);
+      Move(P^, ch[1], chLen);
+    end;
+    Result[n] := ch;
+    inc(P, chLen);
+    inc(n);
+  end;
+  SetLength(Result, n);
+end;

 {$PUSH}{$HINTS OFF}
 {@@ Silence warnings due to an unused parameter }
--- a/components/fpspreadsheet/source/fps.inc
+++ b/components/fpspreadsheet/source/fps.inc
@@ -31,11 +31,6 @@
  FPS_LAZUTF8. Keep undefined for the current Lazarus version. }
 {.$DEFINE FPS_LAZUTF8}

-{ fpspreadsheet requires some functions from unit LazUnicode which was
-  added to Lazarus in v1.8. If an older Lazarus version is used define
-  FPS_NO_LAZUNICODE. Keep undefined for the current Lazarus version. }
-{.$DEFINE FPS_NO_LAZUNICODE}
-  
 { In order to allow selection of multiple ranges in the WorksheetGrid a
  sufficiently new version of the basic TCustomGrid is required. The required
  property "RangeSelect" was introduced in Lazarus 1.4. In order to compile