Replace function HTMLToCaption with a new THTML2TextRenderer class, placed into LazUtils package. Issue #31991.

git-svn-id: trunk@55319 -
This commit is contained in:
juha 2017-06-11 18:33:28 +00:00
parent b47c467010
commit 1b20d91bba
5 changed files with 286 additions and 163 deletions

1
.gitattributes vendored
View File

@ -2984,6 +2984,7 @@ components/lazutils/fileutil.inc svneol=native#text/pascal
components/lazutils/fileutil.pas svneol=native#text/pascal components/lazutils/fileutil.pas svneol=native#text/pascal
components/lazutils/fpcadds.pas svneol=native#text/pascal components/lazutils/fpcadds.pas svneol=native#text/pascal
components/lazutils/fpmake.pp svneol=native#text/plain components/lazutils/fpmake.pp svneol=native#text/plain
components/lazutils/html2textrender.pas svneol=native#text/pascal
components/lazutils/languages/lazutilsstrconsts.cs.po svneol=native#text/plain components/lazutils/languages/lazutilsstrconsts.cs.po svneol=native#text/plain
components/lazutils/languages/lazutilsstrconsts.de.po svneol=native#text/plain components/lazutils/languages/lazutilsstrconsts.de.po svneol=native#text/plain
components/lazutils/languages/lazutilsstrconsts.es.po svneol=native#text/plain components/lazutils/languages/lazutilsstrconsts.es.po svneol=native#text/plain

View File

@ -0,0 +1,250 @@
{
*****************************************************************************
This file is part of LazUtils.
See the file COPYING.modifiedLGPL.txt, included in this distribution,
for details about the license.
*****************************************************************************
Author: Juha Manninen
Abstract:
Render HTML into plain text by using indentation, newlines and stripping tags.
}
unit HTML2TextRender;
{$mode objfpc}{$H+}
interface
uses
Classes, SysUtils,
// LazUtils
LConvEncoding;
type
{ THTML2TextRenderer
renders HTML into plain text as well as possible }
THTML2TextRenderer = class
private
fHTML, fOutput: string;
fMaxLines: integer;
fInHeader: Boolean;
fPendingSpace: Boolean;
fPendingNewLineCnt: Integer;
fCurTag: String;
fIndent: integer;
fLineCnt, fHtmlLen: Integer;
p: Integer;
procedure AddNewLine;
procedure AddOneNewLine;
function AddOutput(aText: String): Boolean;
function HtmlTag: Boolean;
procedure HtmlEntity;
procedure Reset;
public
constructor Create(aHTML: string);
constructor Create(Stream: TStream);
destructor Destroy; override;
function Render(aMaxLines: integer = MaxInt): string;
end;
implementation
{ THTML2TextRenderer }
constructor THTML2TextRenderer.Create(aHTML: string);
begin
fHTML:=aHTML;
// remove UTF8 BOM
if copy(fHTML,1,3)=UTF8BOM then
delete(fHTML,1,3);
end;
constructor THTML2TextRenderer.Create(Stream: TStream);
var
s: string;
begin
SetLength(s,Stream.Size);
if s<>'' then
Stream.Read(s[1],length(s));
Create(s);
end;
destructor THTML2TextRenderer.Destroy;
begin
inherited Destroy;
end;
procedure THTML2TextRenderer.Reset;
begin
fOutput:='';
fInHeader:=False;
fPendingSpace:=False;
fPendingNewLineCnt:=0;
fLineCnt:=1;
end;
procedure THTML2TextRenderer.AddNewLine;
// set a pending linebreak to be added later
begin
if (fOutput<>'') and not fInHeader then
Inc(fPendingNewLineCnt);
end;
procedure THTML2TextRenderer.AddOneNewLine;
begin
if (fOutput<>'') and not fInHeader then
fPendingNewLineCnt:=1;
end;
function THTML2TextRenderer.AddOutput(aText: String): Boolean;
var
i: Integer;
begin
Result:=True;
if fPendingSpace and (fPendingNewLineCnt=0) then
fOutput:=fOutput+' '; // Don't add space at end of line (before newline)
fPendingSpace:=False;
for i:=0 to fPendingNewLineCnt-1 do
begin
fOutput:=fOutput+LineEnding;
Inc(fLineCnt);
// Return False if max # of lines exceeded.
if fLineCnt>fMaxLines then
begin
fOutput:=fOutput+LineEnding+'...';
Exit(False);
end;
end;
if fPendingNewLineCnt>0 then
begin
fOutput:=fOutput+StringOfChar(' ',fIndent*2);
fPendingNewLineCnt:=0;
end;
fOutput:=fOutput+aText;
end;
function THTML2TextRenderer.HtmlTag: Boolean;
// separate a html tag and use it for layout. '<' is already found here.
// ToDo: parse <div class="title"> and use it.
var
Start: Integer;
begin
// first separate a html tag.
inc(p);
Start:=p;
if (p<=fHtmlLen) and (fHTML[p]='/') then
inc(p);
while (p<=fHtmlLen) and not (fHTML[p] in [' ','>','"','/',#9,#10,#13]) do
inc(p);
fCurTag:=UpperCase(copy(fHTML,Start,p-Start));
while p<=fHtmlLen do
begin
if fHTML[p]='"' then begin // skip attribute "value" inside tag
inc(p);
while (p<=fHtmlLen) and (fHTML[p]<>'"') do
inc(p);
if p>fHtmlLen then break;
end;
inc(p);
if (fHTML[p-1]='>') then break; // end of tag
end;
// adjust layout based on html tag, then remove it
Result:=True;
case fCurTag of
'HTML':
fInHeader:=True; // it's a whole page
'BODY':
Reset; // start of body => ignore header and all its data
'P', '/P', 'BR', '/UL':
AddNewLine;
'DIV': begin
AddOneNewLine;
Inc(fIndent);
end;
'/DIV': begin
AddOneNewLine;
Dec(fIndent);
end;
'LI':
begin
Inc(fIndent);
// Don't leave empty lines before list item (not sure if this is good)
AddOneNewLine;
Result:=AddOutput('* '); // Can return False if MaxLines was exceeded
end;
'/LI':
Dec(fIndent);
'A', '/A': // Link
Result:=AddOutput(' _ ');
'HR':
begin
AddOneNewLine;
Result:=AddOutput('----------');
//AddOneNewLine;
end;
end;
end;
procedure THTML2TextRenderer.HtmlEntity;
// entities: &lt; &gt; &amp; &nbsp;
begin
if (p+2<fHtmlLen) and (fHTML[p+1]='l') and (fHTML[p+2]='t') and (fHTML[p+3]=';') then
begin
Inc(p,4);
AddOutput('<');
end else
if (p+2<fHtmlLen) and (fHTML[p+1]='g') and (fHTML[p+2]='t') and (fHTML[p+3]=';') then
begin
Inc(p,4);
AddOutput('>');
end else
if (p+4<fHtmlLen) and (fHTML[p+1]='n') and (fHTML[p+2]='b') and (fHTML[p+3]='s') and (fHTML[p+4]='p') and (fHTML[p+5]=';') then
begin
Inc(p,6);
AddOutput(' ');
end else
if (p+3<fHtmlLen) and (fHTML[p+1]='a') and (fHTML[p+2]='m') and (fHTML[p+3]='p') and (fHTML[p+4]=';') then
begin
Inc(p,5);
AddOutput('&');
end;
end;
function THTML2TextRenderer.Render(aMaxLines: integer): string;
// Parse the HTML and render to plain text.
// Output is limited to aMaxLines lines.
var
OkToGo: Boolean;
begin
fMaxLines:=aMaxLines;
Reset;
p:=1;
OkToGo:=True;
fHtmlLen:=length(fHTML);
while (p<=fHtmlLen) and OkToGo do
begin
case fHTML[p] of
'<': OkToGo:=HtmlTag; // Can return False if MaxLines was exceeded.
'&': HtmlEntity;
' ',#9,#10,#13: // WhiteSpace
begin
fPendingSpace:=True;
inc(p);
end;
else
begin
AddOutput(fHTML[p]); // Add text verbatim.
inc(p);
end;
end;
end;
Result:=fOutput;
end;
end.

View File

@ -16,7 +16,7 @@
<Description Value="Useful units for Lazarus packages."/> <Description Value="Useful units for Lazarus packages."/>
<License Value="Modified LGPL-2"/> <License Value="Modified LGPL-2"/>
<Version Major="1"/> <Version Major="1"/>
<Files Count="91"> <Files Count="92">
<Item1> <Item1>
<Filename Value="LazLoggerImpl.inc"/> <Filename Value="LazLoggerImpl.inc"/>
<Type Value="Include"/> <Type Value="Include"/>
@ -381,6 +381,10 @@
<Filename Value="winlazutf8.inc"/> <Filename Value="winlazutf8.inc"/>
<Type Value="Include"/> <Type Value="Include"/>
</Item91> </Item91>
<Item92>
<Filename Value="html2textrender.pas"/>
<UnitName Value="HTML2TextRender"/>
</Item92>
</Files> </Files>
<LazDoc Paths="../../docs/xml/lazutils"/> <LazDoc Paths="../../docs/xml/lazutils"/>
<i18n> <i18n>

View File

@ -20,7 +20,7 @@ uses
LConvEncoding, lcsvutils, LookupStringList, Maps, Masks, PasWString, LConvEncoding, lcsvutils, LookupStringList, Maps, Masks, PasWString,
StringHashList, TextStrings, Translations, TTCache, TTCalc, TTCMap, TTDebug, StringHashList, TextStrings, Translations, TTCache, TTCalc, TTCMap, TTDebug,
TTError, TTFile, TTGLoad, TTInterp, TTLoad, TTMemory, TTObjs, TTProfile, TTError, TTFile, TTGLoad, TTInterp, TTLoad, TTMemory, TTObjs, TTProfile,
TTRASTER, TTTables, TTTypes, UTF8Process, LazarusPackageIntf; TTRASTER, TTTables, TTTypes, UTF8Process, HTML2TextRender, LazarusPackageIntf;
implementation implementation

View File

@ -38,7 +38,7 @@ uses
LCLProc, LCLIntf, LCLType, FileProcs, Forms, Controls, ComCtrls, StdCtrls, LCLProc, LCLIntf, LCLType, FileProcs, Forms, Controls, ComCtrls, StdCtrls,
Dialogs, Graphics, Buttons, ButtonPanel, Dialogs, Graphics, Buttons, ButtonPanel,
// LazUtils // LazUtils
LConvEncoding, LazUTF8Classes, LazFileUtils, LConvEncoding, LazUTF8Classes, LazFileUtils, HTML2TextRender,
// CodeTools // CodeTools
BasicCodeTools, CodeToolManager, CodeCache, CustomCodeTool, CodeTree, BasicCodeTools, CodeToolManager, CodeCache, CustomCodeTool, CodeTree,
PascalParserTool, FindDeclarationTool, PascalParserTool, FindDeclarationTool,
@ -117,8 +117,7 @@ type
procedure ReleaseStream(const URL: string); procedure ReleaseStream(const URL: string);
end; end;
{ TSimpleHTMLControl { TSimpleHTMLControl }
At the moment it is a TLabel that simply strips all tags }
TSimpleHTMLControl = class(TLabel,TIDEHTMLControlIntf) TSimpleHTMLControl = class(TLabel,TIDEHTMLControlIntf)
private private
@ -136,8 +135,7 @@ type
property MaxLineCount: integer read FMaxLineCount write FMaxLineCount; property MaxLineCount: integer read FMaxLineCount write FMaxLineCount;
end; end;
{ TScrollableHTMLControl { TScrollableHTMLControl }
At the moment it is a TMemo that simply strips all tags }
TScrollableHTMLControl = class(TMemo,TIDEHTMLControlIntf) TScrollableHTMLControl = class(TMemo,TIDEHTMLControlIntf)
private private
@ -371,156 +369,6 @@ begin
ErrMsg:=''; ErrMsg:='';
end; end;
function HTMLToCaption(const s: string; MaxLines: integer): string;
var
p: Integer;
EndPos: Integer;
NewTag: String;
Line: Integer;
sp: LongInt;
InHeader: Boolean;
CurTagName: String;
begin
Result:=s;
//debugln(['HTMLToCaption HTML="',Result,'"']);
Line:=1;
p:=1;
// remove UTF8 BOM
if copy(Result,1,3)=UTF8BOM then
Result:=copy(s,4,length(Result));
InHeader:=false; // it could be a snippet
while p<=length(Result) do begin
if Result[p]='<' then begin
// removes html tags
EndPos:=p+1;
if (EndPos<=length(Result)) and (Result[EndPos]='/') then inc(EndPos);
while (EndPos<=length(Result))
and (not (Result[EndPos] in [' ','>','"','/',#9,#10,#13])) do
inc(EndPos);
CurTagName:=UpperCase(copy(Result,p+1,EndPos-p-1));
while (EndPos<=length(Result)) do begin
if Result[EndPos]='"' then begin
// skip " tag
inc(EndPos);
while (EndPos<=length(Result)) and (Result[EndPos]<>'"') do
inc(EndPos);
if EndPos>length(Result) then break;
end;
if (Result[EndPos]='>') then begin
inc(EndPos);
break;
end;
inc(EndPos);
end;
//debugln(['HTMLToCaption CurTagName=',CurTagName,' Tag="',copy(Result,p,EndPos-p),'"']);
if CurTagName='HTML' then
begin
// it's a whole page
InHeader:=true;
end;
if CurTagName='BODY' then
begin
// start of body => ignore header
InHeader:=false;
Result:=copy(Result,EndPos,length(Result));
p:=1;
EndPos:=1;
Line:=1;
end;
if CurTagName='/BODY' then
begin
// end of body
Result:=copy(Result,1,p-1);
break;
end;
if (CurTagName='P') or (CurTagName='/P') then begin
// add a line break if there is not already one
sp:=p;
while (sp>1) and (Result[sp-1] in [' ',#9]) do dec(sp);
if (sp>1) and (not (Result[sp-1] in [#10,#13])) then
CurTagName:='BR';
end;
if (CurTagName='DIV') or (CurTagName='/DIV')
then begin
// add a line break if not in first line
if Line>1 then
CurTagName:='BR';
end;
if CurTagName='BR' then
begin
NewTag:=LineEnding;
if not InHeader then
inc(Line);
if Line>MaxLines then begin
Result:=copy(Result,1,p)+LineEnding+'...';
break;
end;
end
else
NewTag:='';
if NewTag='' then begin
//debugln(['HTMLToCaption deleting tag ',copy(Result,p,EndPos-p)]);
System.Delete(Result,p,EndPos-p);
end
else begin
Result:=copy(Result,1,p-1)+NewTag+copy(Result,EndPos,length(Result));
inc(p,length(NewTag));
end;
end else if Result[p] in [#9,#10,#13] then begin
// replace spaces and newline characters with a single space
EndPos:=p+1;
while (EndPos<=length(Result)) and (Result[EndPos] in [#9,#10,#13]) do
inc(EndPos);
if (p > 1) and not (Result[p-1] in [#9,#10,#13]) then
begin
Result:=copy(Result,1,p-1)+' '+copy(Result,EndPos,length(Result));
inc(p);
end
else
Result:=copy(Result,1,p-1)+copy(Result,EndPos,length(Result));
end else if Result[p]='&' then begin
// special chars: &lt; &gt; &amp; &nbsp;
if (p+2<Length(Result)) and (Result[p+1]='l') and (Result[p+2]='t') and (Result[p+3]=';') then begin
EndPos:=p+4;
Result:=copy(Result,1,p-1)+'<'+copy(Result,EndPos,length(Result));
end else
if (p+2<Length(Result)) and (Result[p+1]='g') and (Result[p+2]='t') and (Result[p+3]=';') then begin
EndPos:=p+4;
Result:=copy(Result,1,p-1)+'>'+copy(Result,EndPos,length(Result));
end else
if (p+4<Length(Result)) and (Result[p+1]='n') and (Result[p+2]='b') and (Result[p+3]='s') and (Result[p+4]='p') and (Result[p+5]=';') then begin
EndPos:=p+6;
Result:=copy(Result,1,p-1)+' '+copy(Result,EndPos,length(Result));
end else
if (p+3<Length(Result)) and (Result[p+1]='a') and (Result[p+2]='m') and (Result[p+3]='p') and (Result[p+4]=';') then begin
EndPos:=p+5;
Result:=copy(Result,1,p-1)+'&'+copy(Result,EndPos,length(Result));
end;
inc(p);
end else
inc(p);
end;
// trim space at end
p:=length(Result);
while (p>0) and (Result[p] in [' ',#9,#10,#13]) do dec(p);
SetLength(Result,p);
//DebugLn(['HTMLToCaption Caption="',dbgstr(Result),'"']);
end;
function HTMLToCaption(Stream: TStream; MaxLines: integer): string;
var
s: string;
begin
SetLength(s,Stream.Size);
if s<>'' then
Stream.Read(s[1],length(s));
Result:=HTMLToCaption(s,MaxLines);
end;
{ TSimpleHTMLControl } { TSimpleHTMLControl }
procedure TSimpleHTMLControl.SetProvider(const AValue: TAbstractIDEHTMLProvider); procedure TSimpleHTMLControl.SetProvider(const AValue: TAbstractIDEHTMLProvider);
@ -549,6 +397,7 @@ end;
procedure TSimpleHTMLControl.SetURL(const AValue: string); procedure TSimpleHTMLControl.SetURL(const AValue: string);
var var
Stream: TStream; Stream: TStream;
Renderer: THTML2TextRenderer;
NewURL: String; NewURL: String;
begin begin
if Provider=nil then raise Exception.Create('TSimpleHTMLControl.SetURL missing Provider'); if Provider=nil then raise Exception.Create('TSimpleHTMLControl.SetURL missing Provider');
@ -558,9 +407,11 @@ begin
FURL:=NewURL; FURL:=NewURL;
try try
Stream:=Provider.GetStream(FURL,true); Stream:=Provider.GetStream(FURL,true);
Renderer:=THTML2TextRenderer.Create(Stream);
try try
Caption:=HTMLToCaption(Stream, MaxLineCount); Caption:=Renderer.Render(MaxLineCount);
finally finally
Renderer.Free;
Provider.ReleaseStream(FURL); Provider.ReleaseStream(FURL);
end; end;
except except
@ -571,10 +422,17 @@ begin
end; end;
procedure TSimpleHTMLControl.SetHTMLContent(Stream: TStream; const NewURL: string); procedure TSimpleHTMLControl.SetHTMLContent(Stream: TStream; const NewURL: string);
var
Renderer: THTML2TextRenderer;
begin begin
FURL:=NewURL; FURL:=NewURL;
Caption:=HTMLToCaption(Stream,MaxLineCount); Renderer:=THTML2TextRenderer.Create(Stream);
//debugln(['TSimpleHTMLControl.SetHTMLContent ',Caption]); try
Caption:=Renderer.Render(MaxLineCount);
finally
Renderer.Free;
end;
//debugln(['TSimpleHTMLControl.SetHTMLContent: ',Caption]);
end; end;
procedure TSimpleHTMLControl.GetPreferredControlSize(out AWidth, AHeight: integer); procedure TSimpleHTMLControl.GetPreferredControlSize(out AWidth, AHeight: integer);
@ -629,6 +487,7 @@ end;
procedure TScrollableHTMLControl.SetURL(const AValue: string); procedure TScrollableHTMLControl.SetURL(const AValue: string);
var var
Stream: TStream; Stream: TStream;
Renderer: THTML2TextRenderer;
NewURL: String; NewURL: String;
begin begin
if Provider=nil then raise Exception.Create('TScrollableHTMLControl.SetURL missing Provider'); if Provider=nil then raise Exception.Create('TScrollableHTMLControl.SetURL missing Provider');
@ -638,9 +497,11 @@ begin
FURL:=NewURL; FURL:=NewURL;
try try
Stream:=Provider.GetStream(FURL,true); Stream:=Provider.GetStream(FURL,true);
Renderer:=THTML2TextRenderer.Create(Stream);
try try
Caption:=HTMLToCaption(Stream, MaxInt); Caption:=Renderer.Render;
finally finally
Renderer.Free;
Provider.ReleaseStream(FURL); Provider.ReleaseStream(FURL);
end; end;
except except
@ -651,10 +512,17 @@ begin
end; end;
procedure TScrollableHTMLControl.SetHTMLContent(Stream: TStream; const NewURL: string); procedure TScrollableHTMLControl.SetHTMLContent(Stream: TStream; const NewURL: string);
var
Renderer: THTML2TextRenderer;
begin begin
FURL:=NewURL; FURL:=NewURL;
Caption:=HTMLToCaption(Stream,MaxInt); Renderer:=THTML2TextRenderer.Create(Stream);
//debugln(['TScrollableHTMLControl.SetHTMLContent ',Caption]); try
Caption:=Renderer.Render;
finally
Renderer.Free;
end;
//debugln(['TScrollableHTMLControl.SetHTMLContent: ',Caption]);
end; end;
procedure TScrollableHTMLControl.GetPreferredControlSize(out AWidth, AHeight: integer); procedure TScrollableHTMLControl.GetPreferredControlSize(out AWidth, AHeight: integer);