unit fpsHTMLUtils; {$mode objfpc}{$H+} interface uses Classes, SysUtils, contnrs, fpstypes, fpspreadsheet; type TsHTMLEntity = record E: String; Ch: String; N: Word; end; function CleanHTMLString(AText: String): String; function IsHTMLEntity(AText: PChar; out AEntity: TsHTMLEntity): Boolean; function RemoveHTMLEntities(const AText: String): String; type TsHTMLAttr = class Name: String; Value: String; constructor Create(AName, AValue: String); end; TsHTMLAttrList = class(TObjectList) private function GetItem(AIndex: Integer): TsHTMLAttr; procedure SetItem(AIndex: Integer; AValue: TsHTMLAttr); protected procedure ParseStyle(AStyle: String); public function IndexOfName(AName: String): Integer; procedure Parse(AHTML: String); property Items[AIndex: Integer]: TsHTMLAttr read GetItem write SetItem; default; end; TsTagCase = (tcLowercase, tcUppercase, tcProperCase); procedure HTMLToRichText(AWorkbook: TsWorkbook; AFont: TsFont; const AHTMLText: String; out APlainText: String; out ARichTextParams: TsRichTextParams); procedure RichTextToHTML(AWorkbook: TsWorkbook; AFont: TsFont; const APlainText: String; const ARichTextParams: TsRichTextParams; out AHTMLText: String; APrefix:String = ''; ATagCase: TsTagCase = tcLowercase); implementation uses math, lazUtf8, fasthtmlparser, fpsUtils, fpsClasses; const // http://unicode.e-workers.de/entities.php // http://www.utf8-zeichentabelle.de/unicode-utf8-table.pl HTMLEntities: array[0..250] of TsHTMLEntity = ( // A (E: 'Acirc'; Ch: 'Â'; N: 194), // 0 (E: 'acirc'; Ch: 'â'; N: 226), (E: 'acute'; Ch: '´'; N: 180), (E: 'AElig'; Ch: 'Æ'; N: 198), (E: 'aelig'; Ch: 'æ'; N: 230), (E: 'Agrave'; Ch: 'À'; N: 192), (E: 'agrave'; Ch: 'à'; N: 224), (E: 'alefsym';Ch: #$E2#$84#$B5; N: 8501), (E: 'Alpha'; Ch: 'Α'; N: 913), (E: 'alpha'; Ch: 'α'; N: 945), (E: 'amp'; Ch: '&'; N: 38), // 10 (E: 'and'; Ch: '∧'; N: 8743), (E: 'ang'; Ch: '∠'; N: 8736), (E: 'apos'; Ch: ''''; N: 39), (E: 'Aring'; Ch: 'Å'; N: 197), (E: 'aring'; Ch: 'å'; N: 229), (E: 'asymp'; Ch: '≈'; N: 2248), (E: 'Atilde'; Ch: 'Ã'; N: 195), (E: 'atilde'; Ch: 'ã'; N: 227), (E: 'Auml'; Ch: 'Ä'; N: 196), (E: 'auml'; Ch: 'ä'; N: 228), // 20 // B (E: 'bdquo'; Ch: '„'; N: 8222), // 21 (E: 'Beta'; Ch: 'Β'; N: 914), (E: 'beta'; Ch: 'β'; N: 946), (E: 'brvbar'; Ch: '¦'; N: 166), (E: 'bull'; Ch: '•'; N: 8226), // C (E: 'cap'; Ch: '∩'; N: 8745), // 26 (E: 'Ccedil'; Ch: 'Ç'; N: 199), (E: 'ccedil'; Ch: 'ç'; N: 231), (E: 'cedil'; Ch: '¸'; N: 184), (E: 'cent'; Ch: '¢'; N: 162), // 30 (E: 'Chi'; Ch: 'Χ'; N: 935), (E: 'chi'; Ch: 'χ'; N: 967), (E: 'circ'; Ch: 'ˆ'; N: 710), (E: 'clubs'; Ch: '♣'; N: 9827), (E: 'cong'; Ch: #$E2#$89#$85; N: 8773), // approximately equal (E: 'copy'; Ch: '©'; N: 169), (E: 'crarr'; Ch: #$E2#$86#$B5; N: 8629), // carriage return (E: 'cup'; Ch: '∪'; N: 8746), (E: 'curren'; Ch: '¤'; N: 164), // D (E: 'Dagger'; Ch: '‡'; N: 8225), // 40 (E: 'dagger'; Ch: '†'; N: 8224), (E: 'dArr'; Ch: #$E2#$87#$93; N: 8659), // wide down-arrow (E: 'darr'; Ch: '↓'; N: 8595), // narrow down-arrow (E: 'deg'; Ch: '°'; N: 176), (E: 'Delta'; Ch: 'Δ'; N: 916), (E: 'delta'; Ch: 'δ'; N: 948), (E: 'diams'; Ch: '♦'; N: 9830), (E: 'divide'; Ch: '÷'; N: 247), // E (E: 'Eacute'; Ch: 'É'; N: 201), // 49 (E: 'eacute'; Ch: 'é'; N: 233), // 50 (E: 'Ecirc'; Ch: 'Ê'; N: 202), (E: 'ecirc'; Ch: 'ê'; N: 234), (E: 'Egrave'; Ch: 'È'; N: 200), (E: 'egrave'; Ch: 'è'; N: 232), (E: 'empty'; Ch: #$e2#$88#$85; N: 8709), (E: 'emsp'; Ch: #$E2#$80#$83; N: 8195), // Space character width of "m" (E: 'ensp'; Ch: #$E2#$80#$82; N: 8194), // Space character width of "n" (E: 'Epsilon';Ch: 'Ε'; N: 917), // capital epsilon (E: 'epsilon';Ch: 'ε'; N: 949), (E: 'equiv'; Ch: '≡'; N: 8801), // 60 (E: 'Eta'; Ch: 'Η'; N: 919), (E: 'eta'; Ch: 'η'; N: 951), (E: 'ETH'; Ch: 'Ð'; N: 208), (E: 'eth'; Ch: 'ð'; N: 240), (E: 'Euml'; Ch: 'Ë'; N: 203), (E: 'euml'; Ch: 'ë'; N: 235), (E: 'euro'; Ch: '€'; N: 8364), (E: 'exist'; Ch: '∃'; N: 8707), // F (E: 'fnof'; Ch: 'ƒ'; N: 402), // 70 (E: 'forall'; Ch: '∀'; N: 8704), (E: 'frac12'; Ch: '½'; N: 189), (E: 'frac14'; Ch: '¼'; N: 188), (E: 'frac34'; Ch: '¾'; N: 190), (E: 'frasl'; Ch: '⁄'; N: 8260), // G (E: 'Gamma'; Ch: 'Γ'; N: 915), (E: 'gamma'; Ch: 'γ'; N: 947), (E: 'ge'; Ch: '≥'; N: 8805), (E: 'gt'; Ch: '>'; N: 62), // H (E: 'hArr'; Ch: '⇔'; N: 8660), // 80, wide horizontal double arrow (E: 'harr'; Ch: '↔'; N: 8596), // narrow horizontal double arrow (E: 'hearts'; Ch: '♥'; N: 9829), (E: 'hellip'; Ch: '…'; N: 8230), // I (E: 'Iacute'; Ch: 'Í'; N: 205), (E: 'iacute'; Ch: 'í'; N: 237), (E: 'Icirc'; Ch: 'Î'; N: 206), (E: 'icirc'; Ch: 'î'; N: 238), (E: 'iexcl'; Ch: '¡'; N: 161), (E: 'Igrave'; Ch: 'Ì'; N: 204), (E: 'igrave'; Ch: 'ì'; N: 236), // 90 (E: 'image'; Ch: #$E2#$84#$91; N: 2465), // I in factura (E: 'infin'; Ch: '∞'; N: 8734), (E: 'int'; Ch: '∫'; N: 8747), (E: 'Iota'; Ch: 'Ι'; N: 921), (E: 'iota'; Ch: 'ι'; N: 953), (E: 'iquest'; Ch: '¿'; N: 191), (E: 'isin'; Ch: '∈'; N: 8712), (E: 'Iuml'; Ch: 'Ï'; N: 207), (E: 'iuml'; Ch: 'ï'; N: 239), // K (E: 'Kappa'; Ch: 'Κ'; N: 922), // 100 (E: 'kappa'; Ch: 'κ'; N: 254), // L (E: 'Lambda'; Ch: 'Λ'; N: 923), (E: 'lambda'; Ch: 'λ'; N: 955), (E: 'lang'; Ch: '⟨'; N: 9001), // Left-pointing angle bracket (E: 'laquo'; Ch: '«'; N: 171), (E: 'lArr'; Ch: #$E2#$87#$90; N: 8656), // Left-pointing wide arrow (E: 'larr'; Ch: '←'; N: 8592), (E: 'lceil'; Ch: '⌈'; N: 8968), // Left ceiling (E: 'ldquo'; Ch: '“'; N: 8220), (E: 'le'; Ch: '≤'; N: 8804), // 110 (E: 'lfloor'; Ch: '⌊'; N: 8970), // Left floor (E: 'lowast'; Ch: #$e2#$88#$97; N: 8727), // Low asterisk (E: 'loz'; Ch: '◊'; N: 9674), (E: 'lrm'; Ch: '‎'; N: 8206), // Left-to-right mark (E: 'lsaquo'; Ch: '‹'; N: 8249), (E: 'lsquo'; Ch: '‘'; N: 8216), (E: 'lt'; Ch: '<'; N: 60), // M (E: 'macr'; Ch: '¯'; N: 175), (E: 'mdash'; Ch: '—'; N: 8212), (E: 'micro'; Ch: 'µ'; N: 181), // 120 (E: 'middot'; Ch: '·'; N: 183), (E: 'minus'; Ch: '−'; N: 8722), (E: 'Mu'; Ch: 'Μ'; N: 924), (E: 'mu'; Ch: 'μ'; N: 956), // N (E: 'nabla'; Ch: '∇'; N: 8711), (E: 'nbsp'; Ch: ' '; N: 160), // 126 (E: 'ndash'; Ch: '–'; N: 8211), (E: 'ne'; Ch: '≠'; N: 8800), (E: 'ni'; Ch: '∋'; N: 8715), (E: 'not'; Ch: '¬'; N: 172), // 130 (E: 'notin'; Ch: #$e2#$88#$89; N: 8713), // math: "not in" (E: 'nsub'; Ch: #$e2#$8a#$84; N: 8836), // math: "not a subset of" (E: 'Ntilde'; Ch: 'Ñ'; N: 209), (E: 'ntilde'; Ch: 'ñ'; N: 241), (E: 'Nu'; Ch: 'Ν'; N: 925), (E: 'nu'; Ch: 'ν'; N: 957), // O (E: 'Oacute'; Ch: 'Ó'; N: 211), (E: 'oacute'; Ch: 'ó'; N: 243), (E: 'Ocirc'; Ch: 'Ô'; N: 212), (E: 'ocirc'; Ch: 'ô'; N: 244), (E: 'OElig'; Ch: 'Œ'; N: 338), (E: 'oelig'; Ch: 'œ'; N: 339), (E: 'Ograve'; Ch: 'Ò'; N: 210), (E: 'ograve'; Ch: 'ò'; N: 242), (E: 'oline'; Ch: '‾'; N: 8254), (E: 'Omega'; Ch: 'Ω'; N: 937), (E: 'omega'; Ch: 'ω'; N: 969), (E: 'Omicron';Ch: 'Ο'; N: 927), (E: 'omicron';Ch: 'ο'; N: 959), (E: 'oplus'; Ch: #$e2#$8a#$95; N: 8853), // Circled plus (E: 'or'; Ch: '∨'; N: 8744), (E: 'ordf'; Ch: 'ª'; N: 170), (E: 'ordm'; Ch: 'º'; N: 186), (E: 'Oslash'; Ch: 'Ø'; N: 216), (E: 'oslash'; Ch: 'ø'; N: 248), (E: 'Otilde'; Ch: 'Õ'; N: 213), (E: 'otilde'; Ch: 'õ'; N: 245), (E: 'otimes'; Ch: #$E2#$8A#$97; N: 8855), // Circled times (E: 'Ouml'; Ch: 'Ö'; N: 214), (E: 'ouml'; Ch: 'ö'; N: 246), // P (E: 'para'; Ch: '¶'; N: 182), (E: 'part'; Ch: '∂'; N: 8706), (E: 'permil'; Ch: '‰'; N: 8240), (E: 'perp'; Ch: '⊥'; N: 8869), (E: 'Phi'; Ch: 'Φ'; N: 934), (E: 'phi'; Ch: 'φ'; N: 966), (E: 'Pi'; Ch: 'Π'; N: 928), (E: 'pi'; Ch: 'π'; N: 960), // lower-case pi (E: 'piv'; Ch: 'ϖ'; N: 982), (E: 'plusmn'; Ch: '±'; N: 177), (E: 'pound'; Ch: '£'; N: 163), (E: 'Prime'; Ch: '″'; N: 8243), (E: 'prime'; Ch: '′'; N: 8242), (E: 'prod'; Ch: '∏'; N: 8719), (E: 'prop'; Ch: '∝'; N: 8733), (E: 'Psi'; Ch: 'Ψ'; N: 936), (E: 'psi'; Ch: 'ψ'; N: 968), // Q (E: 'quot'; Ch: '"'; N: 34), // R (E: 'radic'; Ch: '√'; N: 8730), (E: 'rang'; Ch: '⟩'; N: 9002), // right-pointing angle bracket (E: 'raquo'; Ch: '»'; N: 187), (E: 'rArr'; Ch: '⇒'; N: 8658), (E: 'rarr'; Ch: '→'; N: 8594), (E: 'rceil'; Ch: '⌉'; N: 8969), // right ceiling (E: 'rdquo'; Ch: '”'; N: 8221), (E: 'real'; Ch: #$E2#$84#$9C; N: 8476), // R in factura (E: 'reg'; Ch: '®'; N: 174), (E: 'rfloor'; Ch: '⌋'; N: 8971), // Right floor (E: 'Rho'; Ch: 'Ρ'; N: 929), (E: 'rho'; Ch: 'ρ'; N: 961), (E: 'rlm'; Ch: ''; N: 8207), // right-to-left mark (E: 'rsaquo'; Ch: '›'; N: 8250), (E: 'rsquo'; Ch: '’'; N: 8217), // S (E: 'sbquo'; Ch: '‚'; N: 8218), (E: 'Scaron'; Ch: 'Š'; N: 352), (E: 'scaron'; Ch: 'š'; N: 353), (E: 'sdot'; Ch: #$E2#$8B#$85; N: 8901), // math: dot operator (E: 'sect'; Ch: '§'; N: 167), (E: 'shy'; Ch: #$C2#$AD; N: 173), // conditional hyphen (E: 'Sigma'; Ch: 'Σ'; N: 931), (E: 'sigma'; Ch: 'σ'; N: 963), (E: 'sigmaf'; Ch: 'ς'; N: 962), (E: 'sim'; Ch: #$E2#$88#$BC; N: 8764), // similar (E: 'spades'; Ch: '♠'; N: 9824), (E: 'sub'; Ch: '⊂'; N: 8834), (E: 'sube'; Ch: '⊆'; N: 8838), (E: 'sum'; Ch: '∑'; N: 8721), (E: 'sup'; Ch: '⊃'; N: 8835), (E: 'sup1'; Ch: '¹'; N: 185), (E: 'sup2'; Ch: '²'; N: 178), (E: 'sup3'; Ch: '³'; N: 179), (E: 'supe'; Ch: '⊇'; N: 8839), (E: 'szlig'; Ch: 'ß'; N: 223), //T (E: 'Tau'; Ch: 'Τ'; N: 932), (E: 'tau'; Ch: 'τ'; N: 964), (E: 'there4'; Ch: '∴'; N: 8756), (E: 'Theta'; Ch: 'Θ'; N: 920), (E: 'theta'; Ch: 'θ'; N: 952), (E: 'thetasym';Ch: 'ϑ'; N: 977), (E: 'thinsp'; Ch: #$E2#$80#$89; N: 8201), // thin space (E: 'THORN'; Ch: 'Þ'; N: 222), (E: 'thorn'; Ch: 'þ'; N: 254), (E: 'tilde'; Ch: '˜'; N: 732), (E: 'times'; Ch: '×'; N: 215), (E: 'trade'; Ch: '™'; N: 8482), // U (E: 'Uacute'; Ch: 'Ú'; N: 218), (E: 'uacute'; Ch: 'ú'; N: 250), (E: 'uArr'; Ch: #$E2#$87#$91; N: 8657), // wide up-arrow (E: 'uarr'; Ch: '↑'; N: 8593), (E: 'Ucirc'; Ch: 'Û'; N: 219), (E: 'ucirc'; Ch: 'û'; N: 251), (E: 'Ugrave'; Ch: 'Ù'; N: 217), (E: 'ugrave'; Ch: 'ù'; N: 249), (E: 'uml'; Ch: '¨'; N: 168), (E: 'upsih'; Ch: 'ϒ'; N: 978), (E: 'Upsilon';Ch: 'Υ'; N: 933), (E: 'upsilon';Ch: 'υ'; N: 965), (E: 'Uuml'; Ch: 'Ü'; N: 220), (E: 'uuml'; Ch: 'ü'; N: 252), // W (E: 'weierp'; Ch: #$E2#$84#$98; N: 8472), // Script Capital P; Weierstrass Elliptic Function // X (E: 'Xi'; Ch: 'Ξ'; N: 926), (E: 'xi'; Ch: 'ξ'; N: 958), // Y (E: 'Yacute'; Ch: 'Ý'; N: 221), (E: 'yacute'; Ch: 'ý'; N: 253), (E: 'yen'; Ch: '¥'; N: 165), (E: 'Yuml'; Ch: 'Ÿ'; N: 376), (E: 'yuml'; Ch: 'ÿ'; N: 255), // Z (E: 'Zeta'; Ch: 'Ζ'; N: 918), (E: 'zeta'; Ch: 'ζ'; N: 950), (E: 'zwj'; Ch: ''; N: 8205), // Zero-width joiner (E: 'zwnj'; Ch: ''; N: 8204) // Zero-width non-joiner ); function IsHTMLEntity(AText: PChar; out AEntity: TsHTMLEntity): Boolean; function Compare(s: String): Boolean; var j: Integer; begin Result := false; for j:=1 to Length(s) do if s[j] <> PChar(AText)[j-1] then exit; if PChar(AText)[Length(s)] <> ';' then exit; Result := true; end; var k: Integer; equ: Boolean; ch1: Char; P: PChar; begin Result := false; for k:=0 to High(HTMLEntities) do begin equ := Compare(HTMLEntities[k].E); if not equ then begin P := AText; ch1 := P^; if ch1 = '#' then begin inc(P); if ch1 = 'x' then equ := Compare(Format('#x%x', [HTMLEntities[k].N])) else equ := Compare(Format('#%d', [HTMLEntities[k].N])); end; end; if equ then begin AEntity := HTMLEntities[k]; Result := true; exit; end; end; end; function CleanHTMLString(AText: String): String; var ent: TsHTMLEntity; P: PChar; ch: Char; hasStartSpace, hasEndSpace: Boolean; begin Result := ''; // Remove leading and trailing spaces and line endings coming from formatted // source lines. Retain 1 single space, at the end even without spaces found. // No idea if this is 100% correct - at least, looks good. hasStartSpace := false; while (Length(AText) > 0) and (AText[1] in [#9, #13, #10, ' ']) do begin if AText[1] = ' ' then hasStartSpace := true; // A leading space will be added later Delete(AText, 1, 1); end; hasEndSpace := false; while (Length(AText) > 0) and (AText[Length(AText)] in [#9, #10, #13, ' ']) do begin hasEndSpace := true; // A trailing space will be added later Delete(AText, Length(AText), 1); end; if AText = '' then begin if hasStartSpace or hasEndSpace then Result := ' '; exit; end; // Replace HTML entities by their counter part UTF8 characters P := @AText[1]; while (P^ <> #0) do begin ch := P^; case ch of ' ': begin // collapse multiple spaces to a single space (HTML spec) // http://stackoverflow.com/questions/24615355/browser-white-space-rendering Result := Result + ' '; inc(P); while (P^ = ' ') do inc(P); dec(P); end; '&': begin inc(P); if (P <> nil) and IsHTMLEntity(P, ent) then begin Result := Result + ent.Ch; inc(P, Length(ent.E)); end else begin Result := Result + '&'; Continue; end; end; else Result := Result + ch; end; inc(P); end; // Add leading and trailing spaces from above. if hasStartSpace then Result := ' ' + Result; if hasEndSpace then Result := Result + ' '; end; function RemoveHTMLEntities(const AText: String): String; var ent: TsHTMLEntity; P: PChar; ch: AnsiChar; begin Result := ''; P := @AText[1]; while (P^ <> #0) do begin ch := P^; case ch of '&': begin inc(P); if (P <> nil) and IsHTMLEntity(P, ent) then begin Result := Result + ent.Ch; inc(P, Length(ent.E)); end else begin Result := Result + '&'; Continue; end; end; else Result := Result + ch; end; inc(P); end; end; {==============================================================================} { TsHTMLAttr } {==============================================================================} constructor TsHTMLAttr.Create(AName, AValue: String); begin Name := AName; Value := AValue; end; {==============================================================================} { TsHTMLAttrList } {==============================================================================} function TsHTMLAttrList.GetItem(AIndex: Integer): TsHTMLAttr; begin Result := TsHTMLAttr(inherited GetItem(AIndex)); end; function TsHTMLAttrList.IndexOfName(AName: String): Integer; begin AName := Lowercase(AName); for Result := 0 to Count-1 do if GetItem(Result).Name = AName then exit; Result := -1; end; { AHTML is a HTML string beginning with a < tag. Seeks the first space to split off the HTML tag. Then seeks for = and " characters to extract the attributes which are split into name/value pairs at the = character. The value part is unquoted. } procedure TsHTMLAttrList.Parse(AHTML: String); var i: Integer; len: Integer; value, nam: String; begin Clear; if (AHTML[1] <> '<') then // just for simplification raise Exception.Create('[THTMLAttrList.Parse] HTML tags expected.'); // Find first space i := 1; len := Length(AHTML); while (i <= len) and (AHTML[i] <> ' ') do inc(i); // Parse attribute string nam := ''; while (i <= len) do begin case AHTML[i] of '=': begin inc(i); value := ''; if AHTML[i] = '"' then begin inc(i); // skip the initial '"' while AHTML[i] <> '"' do begin value := value + AHTML[i]; inc(i); end; inc(i); // skip the final '"' end else while not (AHTML[i] in [' ', '>', '/']) do begin value := value + AHTML[i]; inc(i); end; Add(TsHTMLAttr.Create(lowercase(trim(nam)), trim(value))); nam := ''; end; ' ', '/', '>': ; else nam := nam + AHTML[i]; end; inc(i); end; i := IndexOfName('style'); if i > -1 then ParseStyle(Items[i].Value); end; { AStyle is the value part of a 'style="...."' HTML string. Splits the into individual records at the semicolons (;) and into name-value pairs at the colon (:). Adds the name-value pairs to the list. } procedure TsHTMLAttrList.ParseStyle(AStyle: String); var i, len: Integer; value, nam: String; begin i := 1; len := Length(AStyle); // skip white space while (i <= len) and (AStyle[i] = ' ') do inc(i); // iterate through string nam := ''; while (i <= len) do begin case AStyle[i] of ':': begin // name-value separator inc(i); // skip ':' ... while (i <= len) and (AStyle[i] = ' ') do inc(i); // ... and white space value := ''; while (i <= len) and (AStyle[i] <> ';') do begin value := value + AStyle[i]; inc(i); end; Add(TsHTMLAttr.Create(lowercase(trim(nam)), UnquoteStr(trim(value)))); nam := ''; end; ' ': ; // skip white space else nam := nam + AStyle[i]; end; inc(i); end; end; procedure TsHTMLAttrList.SetItem(AIndex: Integer; AValue: TsHTMLAttr); begin inherited SetItem(AIndex, AValue); end; {==============================================================================} { HTML-to-Rich-text conversion } {==============================================================================} type TsHTMLAnalyzer = class(THTMLParser) private FWorkbook: TsWorkbook; FPlainText: String; FRichTextParams: TsRichTextParams; FAttrList: TsHTMLAttrList; FFontStack: TsIntegerStack; FCurrFont: TsFont; FPointSeparatorSettings: TFormatSettings; FPreserveSpaces: Boolean; function AddFont(AFont: TsFont): Integer; procedure AddRichTextParam(AFont: TsFont; AHyperlinkIndex: Integer = -1); procedure ProcessFontRestore; procedure ReadFont(AFont: TsFont); procedure TagFoundHandler(NoCaseTag, ActualTag: string); procedure TextFoundHandler(AText: string); public constructor Create(AWorkbook: TsWorkbook; AFont: TsFont; AText: String); destructor Destroy; override; property PlainText: String read FPlainText; property RichTextParams: TsRichTextParams read FRichTextParams; property PreserveSpaces: Boolean read FPreserveSpaces write FPreserveSpaces; end; constructor TsHTMLAnalyzer.Create(AWorkbook: TsWorkbook; AFont: TsFont; AText: String); begin if AWorkbook = nil then raise Exception.Create('[TsHTMLAnalyzer.Create] Workbook required.'); if AFont = nil then raise Exception.Create('[TsHTMLAnalyzer.Create] Font required.'); inherited Create(AText); FWorkbook := AWorkbook; OnFoundTag := @TagFoundHandler; OnFoundText := @TextFoundHandler; FPlainText := ''; SetLength(FRichTextParams, 0); FAttrList := TsHTMLAttrList.Create; FCurrFont := TsFont.Create; FCurrFont.CopyOf(AFont); FFontStack := TsIntegerStack.Create; FPointSeparatorSettings := DefaultFormatSettings; FPointSeparatorSettings.DecimalSeparator := '.'; end; destructor TsHTMLAnalyzer.Destroy; begin FreeAndNil(FFontStack); FreeAndNil(FCurrFont); FreeAndNil(FAttrList); inherited Destroy; end; { Stores a font in the workbook's font list. Does not allow duplicates. } function TsHTMLAnalyzer.AddFont(AFont: TsFont): Integer; var fnt: TsFont; begin // Is the font already stored in the workbook's font list? Result := FWorkbook.FindFont(AFont.FontName, AFont.Size, AFont.Style, AFont.Color, AFont.Position); if Result = -1 then begin // No. Create a new font, add it to the list, and return the new index. fnt := TsFont.Create; fnt.CopyOf(AFont); Result := FWorkbook.AddFont(fnt); end; end; procedure TsHTMLAnalyzer.AddRichTextParam(AFont: TsFont; AHyperlinkIndex: Integer = -1); var len: Integer; fntIndex: Integer; n: Integer; begin n := Length(FRichTextParams); len := UTF8Length(FPlainText); fntIndex := AddFont(AFont); if (n > 0) and (FRichTextparams[n-1].FirstIndex = len+1) then begin // Avoid adding another rich-text parameter for the same text location: // Update the previous one FRichTextParams[n-1].FontIndex := fntIndex; FRichTextParams[n-1].HyperlinkIndex := AHyperlinkIndex; end else begin // Add a new rich-text parameter SetLength(FRichTextParams, n+1); FRichTextParams[n].FirstIndex := len + 1; FRichTextParams[n].FontIndex := fntIndex; FRichTextParams[n].HyperlinkIndex := AHyperlinkIndex; end; end; procedure TsHTMLAnalyzer.ProcessFontRestore; var fntIndex: Integer; begin fntIndex := FFontStack.Pop; if fntIndex > -1 then begin FCurrFont.CopyOf(FWorkbook.GetFont(fntIndex)); AddRichTextParam(FCurrFont); end; end; procedure TsHTMLAnalyzer.ReadFont(AFont: TsFont); const FACTOR = 1.2; MIN_FONTSIZE = 6; var idx: Integer; L: TStringList; i, ip, im: Integer; s: String; f: Double; defFntSize: Single; begin idx := FAttrList.IndexOfName('font-family'); // style tag if idx = -1 then idx := FAttrList.IndexOfName('face'); // html tag if idx > -1 then begin L := TStringList.Create; try L.StrictDelimiter := true; L.DelimitedText := FAttrList[idx].Value; AFont.FontName := L[0]; finally L.Free; end; end; idx := FAttrList.IndexOfName('font-size'); if idx = -1 then idx := FAttrList.IndexOfName('size'); if idx > -1 then begin defFntSize := FWorkbook.GetDefaultFont.Size; s := FAttrList[idx].Value; case s of 'medium', '3' : AFont.Size := defFntSize; 'large', '4' : AFont.Size := defFntSize*FACTOR; 'x-large', '5' : AFont.Size := defFntSize*FACTOR*FACTOR; 'xx-large', '6' : AFont.Size := defFntSize*FACTOR*FACTOR*FACTOR; 'small', '2' : AFont.Size := Max(MIN_FONTSIZE, defFntSize/FACTOR); 'x-small' : AFont.Size := Max(MIN_FONTSIZE, defFntSize/FACTOR/FACTOR); 'xx-small', '1' : AFont.Size := Max(MIN_FONTSIZE, defFntSize/FACTOR/FACTOR/FACTOR); 'larger' : AFont.Size := AFont.Size * FACTOR; 'smaller' : AFont.Size := Max(MIN_FONTSIZE, AFont.Size / FACTOR); else if s[1] in ['+', '-'] then begin TryStrToInt(s, i); AFont.Size := defFntSize * IntPower(FACTOR, i); end else begin i := 0; im := 0; ip := pos('%', s); if ip = 0 then begin im := pos('rem', s); if im = 0 then im := pos('em', s); end; if (ip > 0) then i := ip else if (im > 0) then i := im; if i > 0 then begin s := copy(s, 1, i-1); if TryStrToFloat(s, f, FPointSeparatorSettings) then begin if ip > 0 then f := f * 0.01; AFont.Size := Max(MIN_FONTSIZE, abs(f) * defFntSize); end; end else AFont.Size := Max(MIN_FONTSIZE, HTMLLengthStrToPts(s)); end; end; end; idx := FAttrList.IndexOfName('font-style'); if idx > -1 then case FAttrList[idx].Value of 'normal' : Exclude(AFont.Style, fssItalic); 'italic' : Include(AFont.Style, fssItalic); 'oblique' : Include(AFont.Style, fssItalic); end; idx := FAttrList.IndexOfName('font-weight'); if idx > -1 then begin s := FAttrList[idx].Value; if TryStrToInt(s, i) and (i >= 700) then Include(AFont.Style, fssBold); end; idx := FAttrList.IndexOfName('text-decoration'); if idx > -1 then begin s := FAttrList[idx].Value; if pos('underline', s) <> 0 then Include(AFont.Style, fssUnderline); if pos('line-through', s) <> 0 then Include(AFont.Style, fssStrikeout); end; idx := FAttrList.IndexOfName('color'); if idx > -1 then AFont.Color := HTMLColorStrToColor(FAttrList[idx].Value); end; procedure TsHTMLAnalyzer.TagFoundHandler(NoCaseTag, ActualTag: String); begin case NoCaseTag[2] of 'B': case NoCaseTag of '' : begin FFontStack.Push(AddFont(FCurrFont)); Include(FCurrFont.Style, fssBold); AddRichTextParam(FCurrFont); end; '
', '
': FPlainText := FPlainText + FPS_LINE_ENDING; else if (pos('
' : begin FFontStack.Push(AddFont(FCurrFont)); Include(FCurrFont.Style, fssItalic); AddRichTextParam(FCurrFont); end; '': begin FFontStack.Push(AddFont(FCurrFont)); Include(FCurrFont.Style, fssUnderline); AddRichTextParam(FCurrFont); end; end; 'S': case NoCaseTag of '' : begin FFontStack.Push(AddFont(FCurrFont)); Include(FCurrFont.Style, fssStrikeout); AddRichTextParam(FCurrFont); end; '':begin FFontStack.Push(AddFont(FCurrFont)); Include(FCurrFont.Style, fssBold); AddRichTextParam(FCurrFont); end; '': begin FFontStack.Push(AddFont(FCurrFont)); FCurrFont.Position := fpSubscript; AddRichTextParam(FCurrFont); end; '': begin FFontStack.Push(AddFont(FCurrFont)); FCurrFont.Position := fpSuperscript; AddRichTextParam(FCurrFont); end; end; 'U': if (NoCaseTag = '') then begin FFontStack.Push(AddFont(FCurrFont)); Include(FCurrFont.Style, fssUnderline); AddRichTextParam(FCurrFont); end; '/': case NoCaseTag[3] of 'B': if (NoCaseTag) = '
' then ProcessFontRestore; 'D': if (NoCaseTag) = '' then ProcessFontRestore; 'E': if (NoCaseTag) = '' then ProcessFontRestore; 'F': if (NoCaseTag) = '' then ProcessFontRestore; 'I': if (NoCaseTag = '') or (NoCaseTag = '') then ProcessFontRestore; 'S': if (NoCaseTag = '') or (NoCaseTag = '') or (NoCaseTag = '') or (NoCaseTag = '') then ProcessFontRestore; 'U': if (NoCaseTag = '') then ProcessFontRestore; end; end; end; procedure TsHTMLAnalyzer.TextFoundHandler(AText: String); begin if not FPreserveSpaces then AText := CleanHTMLString(AText) else AText := RemoveHTMLEntities(AText); if AText <> '' then begin if FPlainText = '' then FPlainText := AText else FPlainText := FPlainText + AText; end; end; {@@ ---------------------------------------------------------------------------- Extracts rich-text parameters out of an html-formatted string and returns the plain text -------------------------------------------------------------------------------} procedure HTMLToRichText(AWorkbook: TsWorkbook; AFont: TsFont; const AHTMLText: String; out APlainText: String; out ARichTextParams: TsRichTextParams); var analyzer: TsHTMLAnalyzer; j: Integer; begin analyzer := TsHTMLAnalyzer.Create(AWorkbook, AFont, AHTMLText + ''); try analyzer.PreserveSpaces := true; analyzer.Exec; APlainText := analyzer.PlainText; SetLength(ARichTextParams, Length(analyzer.RichTextParams)); for j:=0 to High(ARichTextParams) do ARichTextParams[j] := analyzer.RichTextParams[j]; finally analyzer.Free; end; end; {==============================================================================} { Rich-text-to-HTML conversion } {==============================================================================} type TsHTMLComposer = class private FPointSeparatorSettings: TFormatSettings; FWorkbook: TsWorkbook; FBaseFont: TsFont; FPlainText: String; FRichTextParams: TsRichTextParams; FPrefix: String; FTagCase: TsTagCase; procedure GetFontsFromWorkbook(out AFonts: TsFontArray); function GetTextOfRichTextParam(AIndex: Integer): String; protected function FixTagCase(ATag: String): String; public constructor Create(AWorkbook: TsWorkbook; AFont: TsFont; APrefix: String = ''; ATagCase: TsTagCase = tcLowercase); function Exec(const APlainText: String; const ARichTextParams: TsRichTextParams): String; end; constructor TsHTMLComposer.Create(AWorkbook: TsWorkbook; AFont: TsFont; APrefix: String = ''; ATagCase: TsTagCase = tcLowercase); begin FPointSeparatorSettings := DefaultFormatSettings; FPointSeparatorSettings.DecimalSeparator := '.'; FWorkbook := AWorkbook; FBaseFont := AFont; FPrefix := APrefix; FTagCase := ATagCase; end; function TsHTMLComposer.Exec(const APlainText: String; const ARichTextParams: TsRichTextParams): String; type TChangeFlag = (cfFontName, cfFontSize, cfFontColor); const EPS = 1E-3; var i: Integer; prevFnt, currFnt: TsFont; chgFlags: set of TChangeFlag; openingTag, closingTag: String; fonts: TsFontArray; tag: String; begin if Length(ARichTextParams) = 0 then begin Result := FPlainText; exit; end; FRichTextParams := ARichTextParams; FPlainText := APlainText; prevFnt := TsFont.Create; prevFnt.CopyOf(FBaseFont); if FRichTextParams[0].FirstIndex > 1 then Result := GetTextOfRichTextParam(-1) else Result := ''; GetFontsFromWorkbook(fonts); for i:=0 to High(FRichTextParams) do begin currFnt := fonts[i]; openingTag := ''; closingTag := ''; if not SameFont(currFnt, prevFnt) then begin chgFlags := []; if not SameText(prevFnt.FontName, currFnt.FontName) then Include(chgFlags, cfFontName); if not SameValue(currFnt.Size, prevFnt.Size, EPS) then Include(chgFlags, cfFontSize); if currFnt.Color <> prevFnt.Color then Include(chgFlags, cfFontColor); if [cfFontName, cfFontSize, cfFontColor] * chgFlags <> [] then begin tag := FixTagCase('font'); openingTag := '<' + tag; if cfFontName in chgFlags then begin openingTag := openingTag + ' ' + FPrefix + FixTagCase('face') + '="' + UnquoteStr(currFnt.FontName) + '"'; prevFnt.FontName := currFnt.FontName; end; if cfFontSize in chgFlags then begin openingTag := openingTag + ' ' + FPrefix + FixTagCase('size') + '="' + Format('%.gpt', [currFnt.Size], FPointSeparatorSettings) + '"'; prevFnt.Size := currFnt.Size; end; if cfFontColor in chgFlags then begin openingTag := openingTag + ' ' + FPrefix + FixTagCase('color') + '="' + ColorToHTMLColorStr(currFnt.Color) + '"'; prevFnt.Size := currFnt.Color; end; openingTag := openingTag + '>'; closingTag :='' + closingTag; end; if (fssBold in currFnt.Style) then begin tag := FixTagCase('b'); openingTag := openingTag + '<' + tag + '>'; closingTag := '' + closingTag; prevFnt.Style := prevFnt.Style + [fssBold]; end else prevFnt.Style := prevFnt.Style - [fssBold];; if (fssItalic in currFnt.Style) then begin tag := FixTagCase('i'); openingTag := openingTag + '<' + tag + '>'; closingTag := '' + closingTag; prevFnt.Style := prevFnt.Style + [fssItalic]; end else prevFnt.Style := prevFnt.Style - [fssItalic]; if (fssUnderline in currFnt.Style) then begin tag := FixTagCase('u'); openingTag := openingTag + '<' + tag + '>'; closingTag := '' + closingTag; prevFnt.Style := prevFnt.Style + [fssUnderline]; end else prevFnt.Style := prevFnt.Style - [fssUnderline]; if (fssStrikeout in currFnt.Style) then begin tag := FixTagCase('s'); openingTag := openingTag + '<' + tag + '>'; closingTag := '' + closingTag; prevFnt.Style := prevFnt.Style + [fssStrikeout]; end else prevFnt.Style := prevFnt.Style - [fssStrikeout]; if currFnt.Position <> prevFnt.Position then begin if currFnt.Position = fpSuperscript then begin tag := FixTagCase('sup'); openingTag := openingTag + '<' + tag + '>'; closingTag := '' + closingTag; currFnt.Position := fpSuperscript; end else if currFnt.Position = fpSubscript then begin tag := FixTagCase('sub'); openingTag := openingTag + '<' + tag + '>'; closingTag := '' + closingTag; currFnt.Position := fpSubscript; end else currFnt.Position := fpNormal; end; end; // Add the node text with opening and closing tags (reverse order as opening!) Result := Result + openingTag + GetTextOfRichTextParam(i) + closingTag; end; // for end; function TsHTMLComposer.FixTagCase(ATag: String): String; begin case FTagCase of tcLowercase: Result := Lowercase(ATag); tcUppercase: Result := Uppercase(ATag); tcProperCase: begin Result := Lowercase(ATag); Result[1] := UpCase(Result[1]); end; end; end; procedure TsHTMLComposer.GetFontsFromWorkbook(out AFonts: TsFontArray); var i: Integer; begin SetLength(AFonts, Length(FRichTextParams)); for i:=0 to High(AFonts) do AFonts[i] := FWorkbook.GetFont(FRichTextParams[i].FontIndex); end; function TsHTMLComposer.GetTextOfRichTextParam(AIndex: Integer): String; var p1, p2: Integer; begin if AIndex = -1 then Result := UTF8Copy(FPlainText, 1, FRichTextParams[0].FirstIndex-1) else if AIndex <= High(FRichTextParams) then begin p1 := FRichTextParams[AIndex].FirstIndex; if AIndex < High(FRichTextparams) then p2 := FRichTextParams[AIndex+1].FirstIndex else p2 := UTF8Length(FPlainText) + 1; Result := UTF8Copy(FPlaiNText, p1, p2-p1); end else Result := ''; end; {@@ ---------------------------------------------------------------------------- Constructs a html-coded string from a plain text string and rich-text parameters -------------------------------------------------------------------------------} procedure RichTextToHTML(AWorkbook: TsWorkbook; AFont: TsFont; const APlainText: String; const ARichTextParams: TsRichTextParams; out AHTMLText: String; APrefix: String = ''; ATagCase: TsTagCase = tcLowercase); var composer: TsHTMLComposer; begin if Length(ARichTextParams) = 0 then AHTMLText := APlainText else begin composer := TsHTMLComposer.Create(AWorkbook, AFont, APrefix, ATagCase); try AHTMLText := composer.Exec(APlainText, ARichTextParams); finally composer.Free; end; end; end; end.