Replace function HTMLToCaption with a new THTML2TextRenderer class, placed into LazUtils package. Issue #31991.

git-svn-id: trunk@55319 -
This commit is contained in:
juha 2017-06-11 18:33:28 +00:00
parent b47c467010
commit 1b20d91bba
5 changed files with 286 additions and 163 deletions

1
.gitattributes vendored
View File

@ -2984,6 +2984,7 @@ components/lazutils/fileutil.inc svneol=native#text/pascal
components/lazutils/fileutil.pas svneol=native#text/pascal
components/lazutils/fpcadds.pas svneol=native#text/pascal
components/lazutils/fpmake.pp svneol=native#text/plain
components/lazutils/html2textrender.pas svneol=native#text/pascal
components/lazutils/languages/lazutilsstrconsts.cs.po svneol=native#text/plain
components/lazutils/languages/lazutilsstrconsts.de.po svneol=native#text/plain
components/lazutils/languages/lazutilsstrconsts.es.po svneol=native#text/plain

View File

@ -0,0 +1,250 @@
{
*****************************************************************************
This file is part of LazUtils.
See the file COPYING.modifiedLGPL.txt, included in this distribution,
for details about the license.
*****************************************************************************
Author: Juha Manninen
Abstract:
Render HTML into plain text by using indentation, newlines and stripping tags.
}
unit HTML2TextRender;
{$mode objfpc}{$H+}
interface
uses
Classes, SysUtils,
// LazUtils
LConvEncoding;
type
{ THTML2TextRenderer
renders HTML into plain text as well as possible }
THTML2TextRenderer = class
private
fHTML, fOutput: string;
fMaxLines: integer;
fInHeader: Boolean;
fPendingSpace: Boolean;
fPendingNewLineCnt: Integer;
fCurTag: String;
fIndent: integer;
fLineCnt, fHtmlLen: Integer;
p: Integer;
procedure AddNewLine;
procedure AddOneNewLine;
function AddOutput(aText: String): Boolean;
function HtmlTag: Boolean;
procedure HtmlEntity;
procedure Reset;
public
constructor Create(aHTML: string);
constructor Create(Stream: TStream);
destructor Destroy; override;
function Render(aMaxLines: integer = MaxInt): string;
end;
implementation
{ THTML2TextRenderer }
constructor THTML2TextRenderer.Create(aHTML: string);
begin
fHTML:=aHTML;
// remove UTF8 BOM
if copy(fHTML,1,3)=UTF8BOM then
delete(fHTML,1,3);
end;
constructor THTML2TextRenderer.Create(Stream: TStream);
var
s: string;
begin
SetLength(s,Stream.Size);
if s<>'' then
Stream.Read(s[1],length(s));
Create(s);
end;
destructor THTML2TextRenderer.Destroy;
begin
inherited Destroy;
end;
procedure THTML2TextRenderer.Reset;
begin
fOutput:='';
fInHeader:=False;
fPendingSpace:=False;
fPendingNewLineCnt:=0;
fLineCnt:=1;
end;
procedure THTML2TextRenderer.AddNewLine;
// set a pending linebreak to be added later
begin
if (fOutput<>'') and not fInHeader then
Inc(fPendingNewLineCnt);
end;
procedure THTML2TextRenderer.AddOneNewLine;
begin
if (fOutput<>'') and not fInHeader then
fPendingNewLineCnt:=1;
end;
function THTML2TextRenderer.AddOutput(aText: String): Boolean;
var
i: Integer;
begin
Result:=True;
if fPendingSpace and (fPendingNewLineCnt=0) then
fOutput:=fOutput+' '; // Don't add space at end of line (before newline)
fPendingSpace:=False;
for i:=0 to fPendingNewLineCnt-1 do
begin
fOutput:=fOutput+LineEnding;
Inc(fLineCnt);
// Return False if max # of lines exceeded.
if fLineCnt>fMaxLines then
begin
fOutput:=fOutput+LineEnding+'...';
Exit(False);
end;
end;
if fPendingNewLineCnt>0 then
begin
fOutput:=fOutput+StringOfChar(' ',fIndent*2);
fPendingNewLineCnt:=0;
end;
fOutput:=fOutput+aText;
end;
function THTML2TextRenderer.HtmlTag: Boolean;
// separate a html tag and use it for layout. '<' is already found here.
// ToDo: parse <div class="title"> and use it.
var
Start: Integer;
begin
// first separate a html tag.
inc(p);
Start:=p;
if (p<=fHtmlLen) and (fHTML[p]='/') then
inc(p);
while (p<=fHtmlLen) and not (fHTML[p] in [' ','>','"','/',#9,#10,#13]) do
inc(p);
fCurTag:=UpperCase(copy(fHTML,Start,p-Start));
while p<=fHtmlLen do
begin
if fHTML[p]='"' then begin // skip attribute "value" inside tag
inc(p);
while (p<=fHtmlLen) and (fHTML[p]<>'"') do
inc(p);
if p>fHtmlLen then break;
end;
inc(p);
if (fHTML[p-1]='>') then break; // end of tag
end;
// adjust layout based on html tag, then remove it
Result:=True;
case fCurTag of
'HTML':
fInHeader:=True; // it's a whole page
'BODY':
Reset; // start of body => ignore header and all its data
'P', '/P', 'BR', '/UL':
AddNewLine;
'DIV': begin
AddOneNewLine;
Inc(fIndent);
end;
'/DIV': begin
AddOneNewLine;
Dec(fIndent);
end;
'LI':
begin
Inc(fIndent);
// Don't leave empty lines before list item (not sure if this is good)
AddOneNewLine;
Result:=AddOutput('* '); // Can return False if MaxLines was exceeded
end;
'/LI':
Dec(fIndent);
'A', '/A': // Link
Result:=AddOutput(' _ ');
'HR':
begin
AddOneNewLine;
Result:=AddOutput('----------');
//AddOneNewLine;
end;
end;
end;
procedure THTML2TextRenderer.HtmlEntity;
// entities: &lt; &gt; &amp; &nbsp;
begin
if (p+2<fHtmlLen) and (fHTML[p+1]='l') and (fHTML[p+2]='t') and (fHTML[p+3]=';') then
begin
Inc(p,4);
AddOutput('<');
end else
if (p+2<fHtmlLen) and (fHTML[p+1]='g') and (fHTML[p+2]='t') and (fHTML[p+3]=';') then
begin
Inc(p,4);
AddOutput('>');
end else
if (p+4<fHtmlLen) and (fHTML[p+1]='n') and (fHTML[p+2]='b') and (fHTML[p+3]='s') and (fHTML[p+4]='p') and (fHTML[p+5]=';') then
begin
Inc(p,6);
AddOutput(' ');
end else
if (p+3<fHtmlLen) and (fHTML[p+1]='a') and (fHTML[p+2]='m') and (fHTML[p+3]='p') and (fHTML[p+4]=';') then
begin
Inc(p,5);
AddOutput('&');
end;
end;
function THTML2TextRenderer.Render(aMaxLines: integer): string;
// Parse the HTML and render to plain text.
// Output is limited to aMaxLines lines.
var
OkToGo: Boolean;
begin
fMaxLines:=aMaxLines;
Reset;
p:=1;
OkToGo:=True;
fHtmlLen:=length(fHTML);
while (p<=fHtmlLen) and OkToGo do
begin
case fHTML[p] of
'<': OkToGo:=HtmlTag; // Can return False if MaxLines was exceeded.
'&': HtmlEntity;
' ',#9,#10,#13: // WhiteSpace
begin
fPendingSpace:=True;
inc(p);
end;
else
begin
AddOutput(fHTML[p]); // Add text verbatim.
inc(p);
end;
end;
end;
Result:=fOutput;
end;
end.

View File

@ -16,7 +16,7 @@
<Description Value="Useful units for Lazarus packages."/>
<License Value="Modified LGPL-2"/>
<Version Major="1"/>
<Files Count="91">
<Files Count="92">
<Item1>
<Filename Value="LazLoggerImpl.inc"/>
<Type Value="Include"/>
@ -381,6 +381,10 @@
<Filename Value="winlazutf8.inc"/>
<Type Value="Include"/>
</Item91>
<Item92>
<Filename Value="html2textrender.pas"/>
<UnitName Value="HTML2TextRender"/>
</Item92>
</Files>
<LazDoc Paths="../../docs/xml/lazutils"/>
<i18n>

View File

@ -20,7 +20,7 @@ uses
LConvEncoding, lcsvutils, LookupStringList, Maps, Masks, PasWString,
StringHashList, TextStrings, Translations, TTCache, TTCalc, TTCMap, TTDebug,
TTError, TTFile, TTGLoad, TTInterp, TTLoad, TTMemory, TTObjs, TTProfile,
TTRASTER, TTTables, TTTypes, UTF8Process, LazarusPackageIntf;
TTRASTER, TTTables, TTTypes, UTF8Process, HTML2TextRender, LazarusPackageIntf;
implementation

View File

@ -38,7 +38,7 @@ uses
LCLProc, LCLIntf, LCLType, FileProcs, Forms, Controls, ComCtrls, StdCtrls,
Dialogs, Graphics, Buttons, ButtonPanel,
// LazUtils
LConvEncoding, LazUTF8Classes, LazFileUtils,
LConvEncoding, LazUTF8Classes, LazFileUtils, HTML2TextRender,
// CodeTools
BasicCodeTools, CodeToolManager, CodeCache, CustomCodeTool, CodeTree,
PascalParserTool, FindDeclarationTool,
@ -117,8 +117,7 @@ type
procedure ReleaseStream(const URL: string);
end;
{ TSimpleHTMLControl
At the moment it is a TLabel that simply strips all tags }
{ TSimpleHTMLControl }
TSimpleHTMLControl = class(TLabel,TIDEHTMLControlIntf)
private
@ -136,8 +135,7 @@ type
property MaxLineCount: integer read FMaxLineCount write FMaxLineCount;
end;
{ TScrollableHTMLControl
At the moment it is a TMemo that simply strips all tags }
{ TScrollableHTMLControl }
TScrollableHTMLControl = class(TMemo,TIDEHTMLControlIntf)
private
@ -371,156 +369,6 @@ begin
ErrMsg:='';
end;
function HTMLToCaption(const s: string; MaxLines: integer): string;
var
p: Integer;
EndPos: Integer;
NewTag: String;
Line: Integer;
sp: LongInt;
InHeader: Boolean;
CurTagName: String;
begin
Result:=s;
//debugln(['HTMLToCaption HTML="',Result,'"']);
Line:=1;
p:=1;
// remove UTF8 BOM
if copy(Result,1,3)=UTF8BOM then
Result:=copy(s,4,length(Result));
InHeader:=false; // it could be a snippet
while p<=length(Result) do begin
if Result[p]='<' then begin
// removes html tags
EndPos:=p+1;
if (EndPos<=length(Result)) and (Result[EndPos]='/') then inc(EndPos);
while (EndPos<=length(Result))
and (not (Result[EndPos] in [' ','>','"','/',#9,#10,#13])) do
inc(EndPos);
CurTagName:=UpperCase(copy(Result,p+1,EndPos-p-1));
while (EndPos<=length(Result)) do begin
if Result[EndPos]='"' then begin
// skip " tag
inc(EndPos);
while (EndPos<=length(Result)) and (Result[EndPos]<>'"') do
inc(EndPos);
if EndPos>length(Result) then break;
end;
if (Result[EndPos]='>') then begin
inc(EndPos);
break;
end;
inc(EndPos);
end;
//debugln(['HTMLToCaption CurTagName=',CurTagName,' Tag="',copy(Result,p,EndPos-p),'"']);
if CurTagName='HTML' then
begin
// it's a whole page
InHeader:=true;
end;
if CurTagName='BODY' then
begin
// start of body => ignore header
InHeader:=false;
Result:=copy(Result,EndPos,length(Result));
p:=1;
EndPos:=1;
Line:=1;
end;
if CurTagName='/BODY' then
begin
// end of body
Result:=copy(Result,1,p-1);
break;
end;
if (CurTagName='P') or (CurTagName='/P') then begin
// add a line break if there is not already one
sp:=p;
while (sp>1) and (Result[sp-1] in [' ',#9]) do dec(sp);
if (sp>1) and (not (Result[sp-1] in [#10,#13])) then
CurTagName:='BR';
end;
if (CurTagName='DIV') or (CurTagName='/DIV')
then begin
// add a line break if not in first line
if Line>1 then
CurTagName:='BR';
end;
if CurTagName='BR' then
begin
NewTag:=LineEnding;
if not InHeader then
inc(Line);
if Line>MaxLines then begin
Result:=copy(Result,1,p)+LineEnding+'...';
break;
end;
end
else
NewTag:='';
if NewTag='' then begin
//debugln(['HTMLToCaption deleting tag ',copy(Result,p,EndPos-p)]);
System.Delete(Result,p,EndPos-p);
end
else begin
Result:=copy(Result,1,p-1)+NewTag+copy(Result,EndPos,length(Result));
inc(p,length(NewTag));
end;
end else if Result[p] in [#9,#10,#13] then begin
// replace spaces and newline characters with a single space
EndPos:=p+1;
while (EndPos<=length(Result)) and (Result[EndPos] in [#9,#10,#13]) do
inc(EndPos);
if (p > 1) and not (Result[p-1] in [#9,#10,#13]) then
begin
Result:=copy(Result,1,p-1)+' '+copy(Result,EndPos,length(Result));
inc(p);
end
else
Result:=copy(Result,1,p-1)+copy(Result,EndPos,length(Result));
end else if Result[p]='&' then begin
// special chars: &lt; &gt; &amp; &nbsp;
if (p+2<Length(Result)) and (Result[p+1]='l') and (Result[p+2]='t') and (Result[p+3]=';') then begin
EndPos:=p+4;
Result:=copy(Result,1,p-1)+'<'+copy(Result,EndPos,length(Result));
end else
if (p+2<Length(Result)) and (Result[p+1]='g') and (Result[p+2]='t') and (Result[p+3]=';') then begin
EndPos:=p+4;
Result:=copy(Result,1,p-1)+'>'+copy(Result,EndPos,length(Result));
end else
if (p+4<Length(Result)) and (Result[p+1]='n') and (Result[p+2]='b') and (Result[p+3]='s') and (Result[p+4]='p') and (Result[p+5]=';') then begin
EndPos:=p+6;
Result:=copy(Result,1,p-1)+' '+copy(Result,EndPos,length(Result));
end else
if (p+3<Length(Result)) and (Result[p+1]='a') and (Result[p+2]='m') and (Result[p+3]='p') and (Result[p+4]=';') then begin
EndPos:=p+5;
Result:=copy(Result,1,p-1)+'&'+copy(Result,EndPos,length(Result));
end;
inc(p);
end else
inc(p);
end;
// trim space at end
p:=length(Result);
while (p>0) and (Result[p] in [' ',#9,#10,#13]) do dec(p);
SetLength(Result,p);
//DebugLn(['HTMLToCaption Caption="',dbgstr(Result),'"']);
end;
function HTMLToCaption(Stream: TStream; MaxLines: integer): string;
var
s: string;
begin
SetLength(s,Stream.Size);
if s<>'' then
Stream.Read(s[1],length(s));
Result:=HTMLToCaption(s,MaxLines);
end;
{ TSimpleHTMLControl }
procedure TSimpleHTMLControl.SetProvider(const AValue: TAbstractIDEHTMLProvider);
@ -549,6 +397,7 @@ end;
procedure TSimpleHTMLControl.SetURL(const AValue: string);
var
Stream: TStream;
Renderer: THTML2TextRenderer;
NewURL: String;
begin
if Provider=nil then raise Exception.Create('TSimpleHTMLControl.SetURL missing Provider');
@ -558,9 +407,11 @@ begin
FURL:=NewURL;
try
Stream:=Provider.GetStream(FURL,true);
Renderer:=THTML2TextRenderer.Create(Stream);
try
Caption:=HTMLToCaption(Stream, MaxLineCount);
Caption:=Renderer.Render(MaxLineCount);
finally
Renderer.Free;
Provider.ReleaseStream(FURL);
end;
except
@ -571,10 +422,17 @@ begin
end;
procedure TSimpleHTMLControl.SetHTMLContent(Stream: TStream; const NewURL: string);
var
Renderer: THTML2TextRenderer;
begin
FURL:=NewURL;
Caption:=HTMLToCaption(Stream,MaxLineCount);
//debugln(['TSimpleHTMLControl.SetHTMLContent ',Caption]);
Renderer:=THTML2TextRenderer.Create(Stream);
try
Caption:=Renderer.Render(MaxLineCount);
finally
Renderer.Free;
end;
//debugln(['TSimpleHTMLControl.SetHTMLContent: ',Caption]);
end;
procedure TSimpleHTMLControl.GetPreferredControlSize(out AWidth, AHeight: integer);
@ -629,6 +487,7 @@ end;
procedure TScrollableHTMLControl.SetURL(const AValue: string);
var
Stream: TStream;
Renderer: THTML2TextRenderer;
NewURL: String;
begin
if Provider=nil then raise Exception.Create('TScrollableHTMLControl.SetURL missing Provider');
@ -638,9 +497,11 @@ begin
FURL:=NewURL;
try
Stream:=Provider.GetStream(FURL,true);
Renderer:=THTML2TextRenderer.Create(Stream);
try
Caption:=HTMLToCaption(Stream, MaxInt);
Caption:=Renderer.Render;
finally
Renderer.Free;
Provider.ReleaseStream(FURL);
end;
except
@ -651,10 +512,17 @@ begin
end;
procedure TScrollableHTMLControl.SetHTMLContent(Stream: TStream; const NewURL: string);
var
Renderer: THTML2TextRenderer;
begin
FURL:=NewURL;
Caption:=HTMLToCaption(Stream,MaxInt);
//debugln(['TScrollableHTMLControl.SetHTMLContent ',Caption]);
Renderer:=THTML2TextRenderer.Create(Stream);
try
Caption:=Renderer.Render;
finally
Renderer.Free;
end;
//debugln(['TScrollableHTMLControl.SetHTMLContent: ',Caption]);
end;
procedure TScrollableHTMLControl.GetPreferredControlSize(out AWidth, AHeight: integer);