lazarus/components/wiki/lazwiki/wikihtmlindexer.pas
mattias ec110f77c0 started some tools for the wiki
git-svn-id: trunk@35615 -
2012-02-28 19:22:41 +00:00

504 lines
13 KiB
ObjectPascal

{ Copyright (C) <2008> <Andrew Haines> htmlindexer.pas
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
for more details.
You should have received a copy of the GNU Library General Public License
along with this library; if not, write to the Free Software Foundation,
Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
}
{
See the file COPYING.FPC, included in this distribution,
for details about the copyright.
}
unit wikihtmlindexer;
{$MODE OBJFPC}{$H+}
interface
uses Classes, SysUtils, FastHTMLParser,{$ifdef userb}fos_redblacktree_gen{$else}avl_tree{$endif};
Type
{ TIndexDocument }
TIndexDocument = class(TObject)
private
FDocumentIndex: Integer;
FLastEntry : Integer;
WordIndex: array of Integer;
function getindexentries:integer;
public
function GetWordIndex(i:integer):integer; inline;
procedure AddWordIndex(AIndex: Integer);
constructor Create(ADocumentIndex: Integer);
property DocumentIndex: Integer read FDocumentIndex;
property IndexEntry[i:integer] : Integer read GetWordIndex;
property NumberofIndexEntries : integer read getindexentries;
end;
{ TIndexedWord }
TIndexedWord = class(TObject)
private
FIsTitle: Boolean;
FTheWord: string;
FCachedTopic: TIndexDocument;
FDocuments: Array of TIndexDocument;
function GetDocument ( TopicIndexNum: Integer ) : TIndexDocument;
function GetDocumentCount: Integer;
public
constructor Create(AWord: String; AIsTitle: Boolean);
destructor Destroy; override;
function GetLogicalDocument(AIndex: Integer): TIndexDocument;
property TheWord: string read FTheWord write ftheword; // Always lowercase
property DocumentTopic[TopicIndexNum: Integer]: TIndexDocument read GetDocument;
property DocumentCount: Integer read GetDocumentCount;
property IsTitle: Boolean read FIsTitle write fistitle;
end;
{ TIndexedWordList }
{$ifdef userb}
TRBIndexTree = specialize TGFOS_RBTree<String,TIndexedWord>;
{$endif}
TForEachMethod = procedure (AWord:TIndexedWord) of object;
TForEachProcedure = Procedure (AWord:TIndexedWord;state:pointer);
TIndexedWordList = class(TObject)
private
FIndexTitlesOnly: Boolean;
FIndexedFileCount: DWord;
//vars while processing page
FInTitle,
FInBody: Boolean;
FWordCount: Integer; // only words in body
FDocTitle: String;
FTopicIndex: Integer;
//end vars
FTotalDifferentWordLength: DWord;
FTotalDIfferentWords: DWord;
FTotalWordCount: DWord;
FTotalWordLength: DWord;
FLongestWord: DWord;
FParser: THTMLParser;
{$ifdef userb}
FAVLTree : TRBIndexTree;
{$else}
FAVLTree : TAVLTree;
Spare :TIndexedWord;
{$endif}
function AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
// callbacks
procedure CBFoundTag(NoCaseTag, ActualTag: string);
procedure CBFountText(Text: string);
procedure EatWords(Words: String; IsTitle: Boolean);
public
constructor Create;
destructor Destroy; override;
function IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String; // returns the documents <Title>
procedure Clear;
procedure AddWord(const AWord: TIndexedWord);
procedure ForEach(Proc:TForEachMethod);
procedure ForEach(Proc:TForEachProcedure;state:pointer);
property IndexedFileCount: DWord read FIndexedFileCount;
property LongestWord: DWord read FLongestWord;
property TotalWordCount: DWord read FTotalWordCount;
property TotalDIfferentWords: DWord read FTotalDIfferentWords;
property TotalWordLength: DWord read FTotalWordLength;
property TotalDifferentWordLength: DWord read FTotalDifferentWordLength;
property Words[AWord: String; IsTitle: Boolean] : TIndexedWord read AddGetWord;
end;
implementation
Const GrowSpeed = 10;
function Max(ANumber, BNumber: DWord): DWord;
begin
if ANumber > BNumber then
Result := ANumber
else
Result := BNumber;
end;
const titlexlat : array [boolean] of char = ('0','1');
function makekey( n : string;istitle:boolean):string; inline;
begin
result:=n+'___'+titlexlat[istitle];
end;
Function CompareProcObj(Node1, Node2: Pointer): integer;
var n1,n2 : TIndexedWord;
begin
n1:=TIndexedWord(Node1); n2:=TIndexedWord(Node2);
Result := CompareText(n1.theword, n2.theword);
if Result = 0 then
begin
Result := ord(n2.IsTitle)-ord(n1.IsTitle);
end;
if Result < 0 then Result := -1
else if Result > 0 then Result := 1;
end;
{ TIndexedWordList }
function TIndexedWordList.AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
var
{$ifdef userb}
key : string;
{$else}
n : TAVLTreeNode;
{$endif}
begin
Result := nil;
AWord := LowerCase(AWord);
{$ifdef userb}
key:=makekey(aword,istitle);
if not favltree.Find(key,result) then result:=nil;;
{$else}
if not assigned(spare) then
spare:=TIndexedWord.Create(AWord,IsTitle)
else
begin
spare.TheWord:=aword;
spare.IsTitle:=IsTitle;
end;
n:=favltree.FindKey(Spare,@CompareProcObj);
if assigned(n) then
result:=TIndexedWord(n.Data);
{$endif}
if Result = nil then
begin
Inc(FTotalDifferentWordLength, Length(AWord));
Inc(FTotalDIfferentWords);
{$ifdef userb}
result:=TIndexedWord.Create(AWord,IsTitle);
favltree.add(key,result);
{$else}
Result := spare; // TIndexedWord.Create(AWord,IsTitle);
spare:=nil;
AddWord(Result);
{$endif}
// if IsTitle then
//WriteLn('Creating word: ', AWord);
FLongestWord := Max(FLongestWord, Length(AWord));
end;
Inc(FTotalWordLength, Length(AWord));
Inc(FTotalWordCount);
end;
procedure TIndexedWordList.CBFoundTag(NoCaseTag, ActualTag: string);
begin
if FInBody then begin
if NoCaseTag = '</BODY>' then FInBody := False;
end
else begin
//WriteLn('"',NoCaseTag,'"');
if NoCaseTag = '<TITLE>' then FInTitle := True
else if NoCaseTag = '</TITLE>' then FInTitle := False
else if NoCaseTag = '<BODY>' then FInBody := True
else
end;
if FInBody and FIndexTitlesOnly then FParser.Done := True;
end;
procedure TIndexedWordList.CBFountText(Text: string);
begin
if Length(Text) < 1 then
Exit;
EatWords(Text, FInTitle and not FInBody);
end;
procedure TIndexedWordList.EatWords ( Words: String; IsTitle: Boolean ) ;
var
WordPtr: PChar;
WordStart: PChar;
InWord: Boolean;
IsNumberWord: Boolean;
function IsEndOfWord: Boolean;
begin
Result := not (WordPtr^ in ['a'..'z', '0'..'9', #01, #$DE, #$FE]);
if Result and IsNumberWord then
Result := Result and (WordPtr[0] <> '.');
if Result and InWord then
Result := Result and (WordPtr[0] <> '''');
;
end;
var
WordIndex: TIndexedWord;
WordName: String;
FPos: Integer;
begin
if IsTitle then
FDocTitle := Words;
Words := LowerCase(Words);
WordStart := PChar(Words);
WordPtr := WordStart;
IsNumberWord := False;
InWord := False;
repeat
if InWord and IsEndOfWord then
begin
WordName := Copy(WordStart, 0, (WordPtr-WordStart));
FPos := Pos('''', WordName);
while FPos > 0 do
begin
Delete(WordName, FPos, 1);
FPos := Pos('''', WordName);
end;
WordIndex := addgetword(wordname,istitle);
InWord := False;
IsNumberWord := False;
WordIndex.DocumentTopic[FTopicIndex].AddWordIndex(FWordCount);
//if not IsTitle then
Inc(FWordCount);
end
else if not InWord and not IsEndOfWord then
begin
InWord := True;
WordStart := WordPtr;
IsNumberWord := WordPtr^ in ['0'..'9'];
end;
Inc(WordPtr);
until WordPtr^ = #0;
if InWord then
begin
WordName := Copy(WordStart, 0, (WordPtr-WordStart));
try
WordIndex := addgetword(wordname,istitle); // Self.Words[WordName, IsTitle];
except on e:exception do writeln(wordname); end;
WordIndex.DocumentTopic[FTopicIndex].AddWordIndex(FWordCount);
InWord := False;
//if IsNumberWord then WriteLn('Following is NUMBER WORD: "', (WordStart[0]),'"'); ;
IsNumberWord := False;
//WriteLn(FWordCount, ' "', WordName,'"');
if not IsTitle then
Inc(FWordCount);
end;
end;
function defaultindexedword : TIndexedWord;
begin
result:=Tindexedword.create('',false);
end;
constructor TIndexedWordList.Create;
begin
inherited;
{$ifdef userb}
FAVLTree :=TRBIndexTree.create(@default_rb_string_compare,
@defaultindexedword,
@default_rb_string_undef );
{$else}
favltree:=TAVLTree.Create(@CompareProcObj);
spare:=nil;
{$endif}
end;
procedure FreeObject(const Obj:TIndexedWord);
begin
obj.free;
end;
destructor TIndexedWordList.Destroy;
begin
clear;
{$ifndef userb}
if assigned(spare) then spare.free;
{$endif}
favltree.free;
inherited Destroy;
end;
function TIndexedWordList.IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String;
var
TheFile: String;
begin
FInBody := False;
FInTitle:= False;
FIndexTitlesOnly := AIndexOnlyTitles;
FWordCount := 0;
FTopicIndex := ATOPICIndex;
FIndexedFileCount := FIndexedFileCount +1;
SetLength(TheFile, AStream.Size+1);
AStream.Position := 0;
AStream.Read(TheFile[1], AStream.Size);
TheFile[Length(TheFile)] := #0;
FParser := THTMLParser.Create(@TheFile[1]);
FParser.OnFoundTag := @CBFoundTag;
FParser.OnFoundText := @CBFountText;
FParser.Exec;
FParser.Free;
Result := FDocTitle;
FDocTitle := '';
FInBody := False;
FInTitle:= False;
FWordCount := 0;
FTopicIndex := -1;
AStream.Position := 0;
end;
procedure TIndexedWordList.Clear;
begin
{$ifdef userb}
fAvlTree.ClearN(@FreeObject);
{$else}
fAvlTree.FreeAndClear;
{$endif}
end;
procedure TIndexedWordList.AddWord(const AWord: TIndexedWord);
begin
{$ifdef userb}
favltree.add(makekey(aword.theword,aword.istitle),AWord);
{$else}
favltree.add(aword);
{$endif}
end;
procedure TIndexedWordList.ForEach(Proc:TForEachMethod);
{$ifdef userb}
var key : string;
val:TIndexedWord;
{$else}
var
AVLNode : TAVLTreeNode;
{$endif}
begin
{$ifdef userb}
if favltree.FirstNode(key,val) then
begin // Scan it forward
repeat
proc(val);
until not favltree.FindNext(key,val);
end;
{$else}
AVLNode:=fAVLTree.FindLowest;
while (AVLNode<>nil) do
begin
Proc(TIndexedWord(AVLNode.Data));
AVLNode:=FAVLTree.FindSuccessor(AVLNode)
end;
{$endif}
end;
procedure TIndexedWordList.ForEach(Proc:TForEachProcedure;state:pointer);
{$ifdef userb}
var key : string;
val:TIndexedWord;
{$else}
var
AVLNode : TAVLTreeNode;
{$endif}
begin
{$ifdef userb}
if favltree.FirstNode(key,val) then
begin // Scan it forward
repeat
proc(val,state);
until not favltree.FindNext(key,val);
end;
{$else}
AVLNode:=fAVLTree.FindLowest;
while (AVLNode<>nil) do
begin
Proc(TIndexedWord(AVLNode.Data),State);
AVLNode:=FAVLTree.FindSuccessor(AVLNode)
end;
{$endif}
end;
{ TIndexedWord }
function TIndexedWord.GetDocument ( TopicIndexNum: Integer ) : TIndexDocument;
var
i: Integer;
begin
Result := nil;
if (FCachedTopic <> nil) and (FCachedTopic.FDocumentIndex = TopicIndexNum) then
Exit(FCachedTopic);
for i := 0 to High(FDocuments) do
if FDocuments[i].FDocumentIndex = TopicIndexNum then
Exit(FDocuments[i]);
if Result = nil then
begin
Result := TIndexDocument.Create(TopicIndexNum);
SetLength(FDocuments, Length(FDocuments)+1);
FDocuments[High(FDocuments)] := Result;
end;
FCachedTopic := Result;
end;
function TIndexedWord.GetDocumentCount: Integer;
begin
Result := Length(FDocuments);
end;
constructor TIndexedWord.Create(AWord: String; AIsTitle: Boolean);
begin
FTheWord := AWord;
FIsTitle := AIsTitle;
end;
destructor TIndexedWord.Destroy;
var
i: Integer;
begin
// here the word removed itself from the linked list. But it can't
// touch the AVL tree here.
for i := 0 to High(FDocuments) do
FreeAndNil(FDocuments[i]);
inherited Destroy;
end;
function TIndexedWord.GetLogicalDocument ( AIndex: Integer ) : TIndexDocument;
begin
Result := FDocuments[AIndex];;
end;
{ TIndexDocument }
procedure TIndexDocument.AddWordIndex ( AIndex: Integer ) ;
begin
if FLastEntry>=Length(WordIndex) Then
SetLength(WordIndex, Length(WordIndex)+GrowSpeed);
WordIndex[FLastEntry] := AIndex;
Inc(FLastEntry);
end;
constructor TIndexDocument.Create ( ADocumentIndex: Integer ) ;
begin
FDocumentIndex := ADocumentIndex;
flastentry:=0;
end;
function TIndexDocument.GetWordIndex(i:integer):integer;
begin
result:=WordIndex[i];
end;
function TIndexDocument.getindexentries:integer;
begin
result:=flastentry-1;
end;
end.