fpc/packages/chm/src/chmfiftimain.pas

{ Copyright (C) <2008> <Andrew Haines> chmfiftimain.pas

  This library is free software; you can redistribute it and/or modify it
  under the terms of the GNU Library General Public License as published by
  the Free Software Foundation; either version 2 of the License, or (at your
  option) any later version.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
  for more details.

  You should have received a copy of the GNU Library General Public License
  along with this library; if not, write to the Free Software Foundation,
  Inc., i51 Franklin Street, Fifth Floor, Boston, MA 02111-1301, USA.
}
{
  See the file COPYING.FPC, included in this distribution,
  for details about the copyright.
}
{$IFNDEF FPC_DOTTEDUNITS}
unit chmFiftiMain;
{$ENDIF FPC_DOTTEDUNITS}
{$mode objfpc}{$H+}
interface

{$IFDEF FPC_DOTTEDUNITS}
uses System.Classes, Chm.HtmlIndexer;
{$ELSE FPC_DOTTEDUNITS}
uses Classes, HTMLIndexer;
{$ENDIF FPC_DOTTEDUNITS}

type
  TFiftiMainHeader = record
    Sig: array [0..3] of byte;  //$00,$00,$28,$00
    HTMLFilesCount: DWord;
    RootNodeOffset: DWord;
    Unknown1: DWord; // = 0
    LeafNodeCount: DWord;
    CopyOfRootNodeOffset: DWord;
    TreeDepth: Word;
    Unknown2: DWord; // = 7
    DocIndexScale: Byte;
    DocIndexRootSize: Byte;
    CodeCountScale: Byte;
    CodeCountRootSize: Byte;
    LocationCodeScale: Byte;
    LocationCodeRootSize: Byte;
    Unknown3: array[0..9] of byte; // = 0
    NodeSize: DWord; // 4096;
    Unknown4: DWord; // 0 or 1;
    LastDupWordIndex: DWord;
    LastDupCharIndex: DWord;
    LongestWordLength: DWord; // maximum 99
    TotalWordsIndexed: DWord; // includes duplicates
    TotalWords: DWord; // word count not including duplicates
    TotalWordsLengthPart1: DWord; // length of all the words with duplicates plus the next dword!
    TotalWordsLengthPart2: DWord;
    TotalWordsLength: DWord; // length of all words not including duplicates
    WordBlockUnusedBytes: DWord; // who knows, this makes no sense when there are more than one blocks
    Unknown5: DWord; // 0
    HTMLFilesCountMinusOne: DWord; // maybe
    Unknown6: array[0..23] of Byte; // 0
    WindowsCodePage: DWord; // usually 1252
    LocalID: DWord;
    //Unknown7: array [0..893] of Byte; // 0
  end;

  { TFIftiNode }

  TFIftiNode = class(TObject)
    FLastWord: String;
    FWriteStream: TStream;
    FBlockStream: TMemoryStream;
    ParentNode: TFIftiNode;
    OwnsParentNode : boolean;
    function  AdjustedWord(AWord: String; out AOffset: Byte; AOldWord: String): String;
    procedure ChildIsFull(AWord: String; ANodeOffset: DWord); virtual; abstract;
    function  GuessIfCanHold(AWord: String): Boolean; virtual; abstract;
    procedure Flush(NewBlockNeeded: Boolean); virtual; abstract;
    procedure FillRemainingSpace;
    function  RemainingSpace: DWord;
    constructor Create(AStream: TStream);
    destructor Destroy; override;
  end;

  { TChmSearchWriter }

  TChmSearchWriter = class(TObject)
  private
    FHeaderRec: TFiftiMainHeader;
    FStream: TStream;
    FWordList: TIndexedWordList;
    FActiveLeafNode: TFIftiNode;
    function GetHasData: Boolean;
    procedure ProcessWords;
    procedure WriteHeader(IsPlaceHolder: Boolean);
    procedure WriteAWord(AWord: TIndexedWord);
  public
    procedure WriteToStream;
    property  HasData: Boolean read GetHasData;
    constructor Create(AStream: TStream; AWordList: TIndexedWordList);
    destructor Destroy; override;
  end;

  { TChmSearchReader }

  TChmWLCTopic = record
    TopicIndex: DWord;
    LocationCodes: array of DWord;
  end;

  TChmWLCTopicArray = array of TChmWLCTopic;

  TChmSearchReader = class;

  TChmSearchReaderFoundDataEvent = procedure(Sender: TChmSearchReader; AWord: String; ATopic: DWord; AWordIndex: DWord) of object;

  TChmSearchReader = class(TObject)
  private
    FStream: TStream;
    FFileIsValid: Boolean;
    FFreeStreamOnDestroy: Boolean;
    FDocRootSize,
    FCodeCountRootSize,
    FLocCodeRootSize: Integer;
    FTreeDepth: Integer;
    FRootNodeOffset: DWord;
    FActiveNodeStart: DWord;
    FActiveNodeFreeSpace: Word;
    FNextLeafNode: DWord;
    procedure ReadCommonData;
    procedure MoveToFirstLeafNode;
    procedure MoveToRootNode;
    procedure MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
    function  ReadWordOrPartialWord(ALastWord: String): String; // returns the whole word using the last word as a base
    function  ReadIndexNodeEntry(ALastWord: String; out AWord: String; out ASubNodeStart: DWord): Boolean;
    function  ReadLeafNodeEntry(ALastWord: String; out AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out AWLCOffset: DWord; out AWLCSize: DWord): Boolean;
    function  ReadWLCEntries(AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord): TChmWLCTopicArray;
  public
    constructor Create(AStream: TStream; AFreeStreamOnDestroy: Boolean);
    destructor  Destroy; override;
    procedure   DumpData(AFoundDataEvent: TChmSearchReaderFoundDataEvent);
    function    LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray;
    property    FileIsValid: Boolean read FFileIsValid;
  end;

const
  FIFTI_NODE_SIZE = 4096;

implementation
{$IFDEF FPC_DOTTEDUNITS}
uses System.SysUtils, System.Math, Chm.Base;
{$ELSE FPC_DOTTEDUNITS}
uses SysUtils, Math, ChmBase;
{$ENDIF FPC_DOTTEDUNITS}

type

{ TIndexNode }

  TIndexNode = class(TFIftiNode)
    function  GuessIfCanHold(AWord: String): Boolean; override;
    procedure ChildIsFull ( AWord: String; ANodeOffset: DWord ); override;
    procedure Flush(NewBlockNeeded: Boolean); override;
  end;

  { TLeafNode }

  TLeafNode = class(TFIftiNode)
    FLeafNodeCount: DWord;
    FLastNodeStart: DWord;
    FreeSpace: DWord;
    FDocRootSize,
    FCodeRootSize,
    FLocRootSize: Byte;
    procedure WriteInitialHeader;
    Destructor Destroy; override;
    function  GuessIfCanHold(AWord: String): Boolean; override;
    procedure Flush(NewBlockNeeded: Boolean); override;
    procedure AddWord(AWord: TIndexedWord);
    function WriteWLCEntries(AWord: TIndexedWord; ADocRootSize, ACodeRootSize, ALocRootSize: Byte): DWord;
    property LeafNodeCount: DWord read FLeafNodeCount;
    property DocRootSize: Byte read FDocRootSize write FDocRootSize;
    property CodeRootSize: Byte read FCodeRootSize write FCodeRootSize;
    property LocRootSize: Byte read FLocRootSize write FLocRootSize;
  end;

function GetCompressedIntegerBE(Stream: TStream): DWord;
var
  Buf: Byte;
  Value: Dword = 0;
  Shift: Integer = 0;
begin
  repeat
    Buf := Stream.ReadByte;
    Value := Value or  (Buf and $7F) shl Shift;
    Inc(Shift, 7);
  until (Buf and $80) = 0;
  Result := Value;
end;

procedure WriteCompressedIntegerBE(Stream: TStream; AInt: DWord);
var
  Bits: Integer;
  Tmp: DWord;
  Buf: Byte;
begin
  Tmp := AInt;
  Bits := 0;
  while Tmp <> 0 do
  begin
    Tmp := Tmp shr 1;
    Inc(Bits);
  end;

  repeat
    Buf := (AInt shr (Tmp * 7)) and $7F;
    if Bits > 7 then
      Buf := Buf or $80;
    Dec(Bits, 7);
    Inc(Tmp);
    Stream.WriteByte(Buf);
  until Bits <= 0;
end;

function WriteScaleRootInt(ANumber: DWord; out Bits: DWord; Root: Integer): Byte;
var
  Tmp: DWord;
  Mask: DWord;
//  Scale: Integer;
  NeededBits: Integer;
  PrefixBits: Integer;
  RootBits: Integer;
begin
//  Scale := 2;
  Bits := 0;
  Result := Root;

  Tmp := ANumber;
  NeededBits := 0;
  while Tmp <> 0 do
  begin
    Inc(NeededBits);
    Tmp := Tmp shr 1;
  end;
  PrefixBits := Max(0, NeededBits-Root);

  RootBits := NeededBits -1; //
  if RootBits < Root then
    RootBits := Root;
  if RootBits < 0 then
    RootBits := 0;

  Mask := 0;
  if RootBits-1 >= 0 then
    for Tmp := 0 to RootBits-1 do
       Mask := Mask or (DWord(1) shl Tmp);
  Bits := not Mask;
  Bits := Bits shl 1; // make space for empty bit
  Bits := Bits or (ANumber and Mask);

  Result := PrefixBits + 1 + RootBits;
  Bits := (Bits shl (32-Result)) shr (32 - Result);
end;

{ TChmSearchWriter }

procedure TChmSearchWriter.ProcessWords;
begin
  FWordList.ForEach(@WriteAword);
  if FActiveLeafNode <> nil then
    FActiveLeafNode.Flush(False); // causes the unwritten parts of the tree to be written
end;

function TChmSearchWriter.GetHasData: Boolean;
begin
  Result := FWordList.IndexedFileCount > 0;
end;


procedure TChmSearchWriter.WriteHeader ( IsPlaceHolder: Boolean ) ;
var
  TmpNode: TFIftiNode;
  i: Integer;
begin
  if IsPlaceHolder then
  begin
    FStream.Size := $400; // the header size. we will fill this after the nodes have been determined
    FStream.Position := $400;
    FillChar(PAnsiChar(TMemoryStream(FStream).Memory)^, $400, 0);
    FHeaderRec.DocIndexRootSize := 1;
    FHeaderRec.CodeCountRootSize := 1;
    FHeaderRec.LocationCodeRootSize := 4;
    Exit;
  end;
  // write the glorious header
  FHeaderRec.Sig[2] := $28;
  FHeaderRec.HTMLFilesCount := FWordList.IndexedFileCount;
  FHeaderRec.RootNodeOffset := FStream.Size-FIFTI_NODE_SIZE;
  FHeaderRec.LeafNodeCount := TLeafNode(FActiveLeafNode).LeafNodeCount;
  FHeaderRec.CopyOfRootNodeOffset := FHeaderRec.RootNodeOffset;
  FHeaderRec.TreeDepth := 0;
  TmpNode := FActiveLeafNode;
  while TmpNode <> nil do
  begin
    Inc(FHeaderRec.TreeDepth);
    TmpNode := TmpNode.ParentNode;
  end;
  FHeaderRec.DocIndexScale := 2;
  FHeaderRec.CodeCountScale := 2;
  FHeaderRec.LocationCodeScale := 2;

  //FHeaderRec.DocIndexRootSize := 15;
  //FHeaderRec.CodeCountRootSize := 15;
  //FHeaderRec.LocationCodeRootSize := 15;

  FHeaderRec.NodeSize := FIFTI_NODE_SIZE;
  FHeaderRec.LongestWordLength := FWordList.LongestWord;
  FHeaderRec.TotalWordsIndexed := FWordList.TotalWordCount;
  FHeaderRec.TotalWords := FWordList.TotalDIfferentWords;
  FHeaderRec.TotalWordsLengthPart1 := FWordList.TotalWordLength;
  FHeaderRec.TotalWordsLength := FWordList.TotalDifferentWordLength;
  FHeaderRec.WindowsCodePage := 1252;

  FStream.Position := 0;

  FStream.Write(FHeaderRec.Sig[0], 4);
  FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount));
  FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset));
  FStream.WriteDWord(NtoLE(0)); // unknown 1
  FStream.WriteDWord(NtoLE(FHeaderRec.LeafNodeCount));
  FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset)); // yes twice
  FStream.WriteWord(NtoLE(FHeaderRec.TreeDepth));
  FStream.WriteDWord(NtoLE(DWord(7)));
  FStream.WriteByte(2);
  FStream.WriteByte(FHeaderRec.DocIndexRootSize);
  FStream.WriteByte(2);
  FStream.WriteByte(FHeaderRec.CodeCountRootSize);
  FStream.WriteByte(2);
  FStream.WriteByte(FHeaderRec.LocationCodeRootSize);
  // eat 10 bytes
  FStream.WriteWord(0);
  FStream.WriteDWord(0);
  FStream.WriteDWord(0);

  FStream.WriteDWord(NtoLE(FHeaderRec.NodeSize));
  FStream.WriteDWord(NtoLE(DWord(0)));
  FStream.WriteDWord(1);
  FStream.WriteDWord(5);
  FStream.WriteDWord(NtoLE(FHeaderRec.LongestWordLength));
  FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsIndexed));
  FStream.WriteDWord(NtoLE(FHeaderRec.TotalWords));
  FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart1));
  FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart2));
  FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLength));
  FStream.WriteDWord(NtoLE(TLeafNode(FActiveLeafNode).FreeSpace));
  FStream.WriteDWord(NtoLE(0));
  FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount-1));
  for i := 0 to 23 do FStream.WriteByte(0);
  FStream.WriteDWord(NtoLE(FHeaderRec.WindowsCodePage));
  FStream.WriteDWord(NtoLE(DWord(1033))); // LCID
  for i := 0 to 893 do FStream.WriteByte(0);
end;

procedure TChmSearchWriter.WriteAWord ( AWord: TIndexedWord ) ;

begin
  if FActiveLeafNode = nil then
  begin
    FActiveLeafNode := TLeafNode.Create(FStream);
    with TLeafNode(FActiveLeafNode) do
    begin
      DocRootSize := FHeaderRec.DocIndexRootSize;
      CodeRootSize := FHeaderRec.CodeCountRootSize;
      LocRootSize := FHeaderRec.LocationCodeRootSize;
    end;
  end;
  if FActiveLeafNode.GuessIfCanHold(AWord.TheWord) = False then
  begin
    FActiveLeafNode.Flush(True);
  end;
  TLeafNode(FActiveLeafNode).AddWord(AWord);
end;

procedure TChmSearchWriter.WriteToStream;
begin
  WriteHeader(True);
  ProcessWords;
  WriteHeader(False);
end;

constructor TChmSearchWriter.Create ( AStream: TStream;
  AWordList: TIndexedWordList ) ;
begin
  FStream := AStream;
  FWordList := AWordList;
  FActiveLeafNode:=NIL;
end;

destructor TChmSearchWriter.Destroy;

begin
 freeandnil(FActiveLeafNode);
 inherited;
end;


{ TLeafNode }

function TFIftiNode.RemainingSpace: DWord;
begin
  Result := FIFTI_NODE_SIZE - FBlockStream.Position;
end;

constructor TFIftiNode.Create ( AStream: TStream ) ;
begin
  inherited Create;
  FWriteStream := AStream;
  FBlockStream := TMemoryStream.Create;
  OwnsParentNode :=false;
end;

destructor TFIftiNode.Destroy;
begin
  FBlockStream.Free;
  if OwnsParentNode then ParentNode.Free;
  inherited Destroy;
end;

procedure TFIftiNode.FillRemainingSpace;
begin
  while RemainingSpace > 0 do
    FBlockStream.WriteByte(0);
end;

function TFIftiNode.AdjustedWord ( AWord: String; out AOffset: Byte; AOldWord: String ) : String;
var
  Count1,
  Count2: Integer;
  Count: Integer;
  i: Integer;
begin
  if AWord = AOldWord then
  begin
    AOffset := Length(AWord);
    Exit('');
  end;
  // else
  Count1 := Length(AOldWord);
  Count2 := Length(AWord);

  if Count1<Count2 then
    Count := Count1
  else
    Count := Count2;

  for i := 1 to Count do
  begin
    AOffset := i-1;
    if AOldWord[i] <> AWord[i]
      then Exit(Copy(AWord, i, Length(AWord)));
  end;
  Result := AWord;
  AOffset := 0;
end;

procedure TLeafNode.WriteInitialHeader;
begin
  FBlockStream.WriteDWord(0);
  FBlockStream.WriteWord(0);
  FBlockStream.WriteWord(0);
end;

destructor TLeafNode.Destroy;
begin
  inherited Destroy;
end;

function TLeafNode.GuessIfCanHold ( AWord: String ) : Boolean;
var
  WordOffset: Byte;
begin
  Result := 17 + Length(AdjustedWord(AWord, WordOffset, FLastWord)) < RemainingSpace;
end;

procedure TLeafNode.Flush(NewBlockNeeded: Boolean);
var
  FTmpPos: DWord;
begin
  Inc(FLeafNodeCount);
  FTmpPos := FWriteStream.Position;
  // update the previous leaf node about our position.
  if FLastNodeStart > 0 then
  begin
    FWriteStream.Position := FLastNodeStart;
    FWriteStream.WriteDWord(NtoLE(FTmpPos));
    FWriteStream.Position := FTmpPos;
  end;
  FLastNodeStart := FTmpPos;

  FreeSpace := RemainingSpace;

  FillRemainingSpace;

  // update the leaf header to show the available space.
  FBlockStream.Position := 6;
  FBlockStream.WriteWord(NtoLE(Word(FreeSpace)));

  // copy the leaf block to the fiftimain file
  FBlockStream.Position := 0;
  FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
  FBlockStream.Position := 0;

  if NewBlockNeeded or ((NewBlockNeeded = False) and (ParentNode <> nil)) then
  begin
    if ParentNode = nil then
      begin
        ParentNode := TIndexNode.Create(FWriteStream);
        OwnsParentNode:=True;
      end;
    ParentNode.ChildIsFull(FLastWord, FLastNodeStart);
    if (NewBlockNeeded = False) then
      ParentNode.Flush(False);
  end;

  FLastWord := '';
end;

procedure TLeafNode.AddWord ( AWord: TIndexedWord ) ;
var
  Offset: Byte;
  NewWord: String;
  WLCSize: DWord;
begin
  if Length(AWord.TheWord) > 99 then
    Exit; // Maximum word length is 99
  if FBlockStream.Position = 0 then
    WriteInitialHeader;

  NewWord := AdjustedWord(AWord.TheWord, Offset, FLastWord);

  FLastWord := AWord.TheWord;

  FBlockStream.WriteByte(Length(NewWord)+1);
  FBlockStream.WriteByte(Offset);

  // length can be 0 if it is the same word as the last. there is a word entry each for title and content

  if Length(NewWord) > 0 then
    FBlockStream.Write(NewWord[1], Length(NewWord));

  FBlockStream.WriteByte(Ord(AWord.IsTitle));
  WriteCompressedIntegerBE(FBlockStream, AWord.DocumentCount);
  FBlockStream.WriteDWord(NtoLE(DWord(FWriteStream.Position)));
  FBlockStream.WriteWord(0);

  // write WLC to FWriteStream so we can write the size of the wlc entries
  WLCSize := WriteWLCEntries(AWord, FDocRootSize, FCodeRootSize, FLocRootSize);

  WriteCompressedIntegerBE(FBlockStream, WLCSize);
  if FBlockStream.Position > FIFTI_NODE_SIZE then
    raise Exception.Create('FIFTIMAIN Leaf node has written past the block!');
end;

function Min(AValue, BValue: Byte): Byte;
begin
  if AValue < BValue then
    Result := AValue
  else Result := BValue;
end;

function Max(AValue, BValue: Byte): Byte;
begin
  if AValue > BValue then
    Result := AValue
  else Result := BValue;
end;

function Max(AValue, BValue: Integer): Integer;
begin
  if AValue > BValue then
    Result := AValue
  else Result := BValue;
end;

function Max(AValue, BValue: DWord): DWord;
begin
  if AValue > BValue then
    Result := AValue
  else Result := BValue;
end;


function TLeafNode.WriteWLCEntries ( AWord: TIndexedWord ; ADocRootSize, ACodeRootSize, ALocRootSize: Byte) : DWord;
var
  LastDocIndex: DWord;
  LastLocCode: DWord;
  UsedBits: Byte;
  Buf: Byte;
  function NewDocDelta(ADocIndex: DWord): DWord;
  begin
    Result := ADocIndex - LastDocIndex;
    LastDocIndex := ADocIndex;
  end;
  function NewLocCode(ALocCode: DWord): DWord;
  begin
    Result := ALocCode - LastLocCode;
    LastLocCode := ALocCode;
  end;
  procedure AddValue(AValue: DWord; BitCount: Byte);
  var
    NeededBits: Byte;
    Tmp: Byte;
  begin
    AValue := AValue shl (32 - BitCount);
    while BitCount > 0 do
    begin
      NeededBits := 8 - UsedBits;
      Tmp := Hi(Hi(DWord(AValue shr (UsedBits))));
      Buf := Buf or Tmp;
      Inc(UsedBits, Min(BitCount, NeededBits));
      AValue := AValue shl Min(BitCount, NeededBits);
      Dec(BitCount, Min(BitCount, NeededBits));

      if (UsedBits = 8) then
      begin
        FWriteStream.WriteByte(Buf);
        UsedBits := 0;
        NeededBits := 0;
        Buf := 0;
      end;
    end;
  end;
  procedure FlushBuffer;
  begin
    if UsedBits > 0 then
      FWriteStream.WriteByte(Buf);
    UsedBits := 0;
    Buf := 0;
  end;
var
  DocDelta: DWord;
  LocDelta: DWord;
  StartPos: DWord;
  Bits: DWord;
  BitCount: Byte;
  i,
  j: Integer;
  Doc: TIndexDocument;
//  proced
begin
  StartPos := FWriteStream.Position;
  LastDocIndex := 0;
  UsedBits := 0;
  Buf := 0;
  for i := 0 to AWord.DocumentCount-1 do
  begin
    LastLocCode := 0;
    Doc := AWord.GetLogicalDocument(i);
    DocDelta := NewDocDelta(Doc.DocumentIndex);
    BitCount := WriteScaleRootInt(DocDelta, Bits, ADocRootSize);
    AddValue(Bits, BitCount);
    BitCount := WriteScaleRootInt(Doc.NumberOfIndexEntries, Bits, ACodeRootSize);
    AddValue(Bits, BitCount);

    for j := 0 to Doc.NumberOfIndexEntries-1 do
    begin
      LocDelta := NewLocCode(Doc.IndexEntry[j]);
      BitCount := WriteScaleRootInt(LocDelta, Bits, ALocRootSize);
      AddValue(Bits, BitCount);
    end;
    FlushBuffer;
  end;


  Result := FWriteStream.Position-StartPos;
end;


{ TIndexNode }

function TIndexNode.GuessIfCanHold ( AWord: String ) : Boolean;
var
  Offset: Byte;
begin
  Result := FBlockStream.Position + 8 + Length(AdjustedWord(AWord, Offset, FLastWord)) < FIFTI_NODE_SIZE;
end;

procedure TIndexNode.ChildIsFull ( AWord: String; ANodeOffset: DWord ) ;
var
  Offset: Byte;
  NewWord: String;
begin
  if FBlockStream.Position = 0 then
    FBlockStream.WriteWord(0); // free space at end. updated when the block is flushed
  if GuessIfCanHold(AWord) = False then
    Flush(True);
  NewWord := AdjustedWord(AWord, Offset, FLastWord);
  FLastWord:=AWord;

  // Write the Index node Entry
  FBlockStream.WriteByte(Length(NewWord)+1);
  FBlockStream.WriteByte(Offset);
  FBlockStream.Write(NewWord[1], Length(NewWord));
  FBlockStream.WriteDWord(NtoLE(ANodeOffset));
  FBlockStream.WriteWord(0);
  if FBlockStream.Position > FIFTI_NODE_SIZE then
    raise Exception.Create('FIFTIMAIN Index node has written past the block!');
end;

procedure TIndexNode.Flush ( NewBlockNeeded: Boolean ) ;
var
  RemSize: DWord;
begin
  if NewBlockNeeded then
  begin
    if ParentNode = nil then
      begin
        ParentNode := TIndexNode.Create(FWriteStream);
        OwnsParentNode:=True;
      end;
  end;

  if ParentNode <> nil then
    ParentNode.ChildIsFull(FLastWord, FWriteStream.Position);

  RemSize := RemainingSpace;
  FillRemainingSpace;
  FBlockStream.Position := 0;
  FBlockStream.WriteWord(NtoLE(RemSize));

  FBlockStream.Position := 0;

  FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
  FBlockStream.Position := 0;

  FLastWord := '';

  if NewBlockNeeded then
    FBlockStream.WriteDWord(0) // placeholder to write free space in when block is full
  else
    if ParentNode <> nil then
      ParentNode.Flush(NewBlockNeeded);
end;

{ TChmSearchReader }

procedure TChmSearchReader.ReadCommonData;
var
  Sig: DWord;
begin
   FStream.Position := 0;
   Sig := LEtoN(FStream.ReadDWord);
   FFileIsValid :=  Sig = $00280000;

   if not FileIsValid then
     Exit;

   // root node address
   FStream.Position := $8;
   FRootNodeOffset := LEtoN(FStream.ReadDWord);

   // Tree Depth
   FStream.Position := $18;
   FTreeDepth := LEtoN(FStream.ReadWord);

   // Root sizes for scale and root integers
   FStream.Position := $1E;
   if FStream.ReadByte <> 2 then // we only can read the files when scale is 2
     FFileIsValid := False;
   FDocRootSize := FStream.ReadByte;

   if FStream.ReadByte <> 2 then
     FFileIsValid := False;
   FCodeCountRootSize := FStream.ReadByte;

   if FStream.ReadByte <> 2 then
     FFileIsValid := False;
   FLocCodeRootSize := FStream.ReadByte;

end;

procedure TChmSearchReader.MoveToFirstLeafNode;
var
  NodeDepth: Integer;
  NodeOffset: DWord;
  LastWord: String;
  NewWord: String;
begin
  NodeDepth := FTreeDepth;
  MoveToRootNode;
  while NodeDepth > 1 do
  begin
    LastWord := '';
    ReadIndexNodeEntry(LastWord, NewWord, NodeOffset);
    Dec(NodeDepth);
    MoveToNode(NodeOffset, NodeDepth);
  end;
end;

procedure TChmSearchReader.MoveToRootNode;
begin
  MoveToNode(FRootNodeOffset, FTreeDepth);
end;

procedure TChmSearchReader.MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
begin
  FStream.Position := ANodeOffset;
  FActiveNodeStart := FStream.Position;
  if ANodeDepth > 1 then
  begin
    FnextLeafNode := 0;
    FActiveNodeFreeSpace := LEtoN(FStream.ReadWord); // empty space at end of node
  end
  else
  begin
    FnextLeafNode := LEtoN(FStream.ReadDWord);
    FStream.ReadWord;
    FActiveNodeFreeSpace := LEtoN(FStream.ReadWord);
  end;
end;

function TChmSearchReader.ReadWordOrPartialWord ( ALastWord: String ) : String;
var
  WordLength: Integer;
  CopyLastWordCharCount: Integer;
begin
  WordLength := FStream.ReadByte;
  CopyLastWordCharCount := FStream.ReadByte;
  if CopyLastWordCharCount > 0 then
    Result := Copy(ALastWord, 1, CopyLastWordCharCount);
  SetLength(Result, (WordLength-1) + CopyLastWordCharCount);
  if WordLength > 1 then
    FStream.Read(Result[1+CopyLastWordCharCount], WordLength-1);
end;

function TChmSearchReader.ReadIndexNodeEntry (ALastWord: String;  out AWord: String; out
  ASubNodeStart: DWord ): Boolean;
begin
  Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
  if not Result then
    Exit;
  AWord := ReadWordOrPartialWord(ALastWord);
  ASubNodeStart := LEtoN(FStream.ReadDWord);
  FStream.ReadWord;
end;

function TChmSearchReader.ReadLeafNodeEntry ( ALastWord: String; out
  AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out
  AWLCOffset: DWord; out AWLCSize: DWord ): Boolean;
begin
  Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
  if not Result then
    Exit;
  AWord := ReadWordOrPartialWord(ALastWord);
  AInTitle := FStream.ReadByte = 1;
  AWLCCount := GetCompressedIntegerBE(FStream);
  AWLCOffset := LEtoN(FStream.ReadDWord);
  FStream.ReadWord;
  AWLCSize := GetCompressedIntegerBE(FStream);

end;

function TChmSearchReader.ReadWLCEntries (AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord ) : TChmWLCTopicArray;

var
  Buf: Byte;
  BitsInBuffer: Integer;
  FinalPosition: int64;

  function GetNextByte(): Boolean;
  begin
    Result := (FStream.Position < FinalPosition);
    if Result then
    begin
      Buf := FStream.ReadByte;
      Inc(BitsInBuffer, 8);
    end;
  end;

  function ShiftBuffer: Boolean;
  begin
    Buf := (Buf and $7F) shl 1;
    Dec(BitsInBuffer);
    Result := (BitsInBuffer > 0) or GetNextByte();
  end;

  function ReadWLC(RootSize: DWord): DWord;
  var
    PrefixBits: Integer = 0;
    RemainingBits: Integer; // only the bits for this number not the bits in buffer
  begin
    if (BitsInBuffer = 0) then
      GetNextByte();
    Result := (Buf and $80) shr 7;

    while (Buf and $80) > 0 do // find out how many prefix bits there are
    begin
      Inc(PrefixBits);
      if not ShiftBuffer then
        Exit;
    end;

    // skip divider (zero) bit
    if not ShiftBuffer then
      Exit;

    Remainingbits := RootSize + Max(Integer(PrefixBits-1), 0);
    while RemainingBits > 0 do
    begin
      Result := Result shl 1;
      Result := Result or (Buf shr 7);
      Dec(RemainingBits);
      if not ShiftBuffer then
        Exit;
    end;
  end;

  procedure ClearBuffer;
  begin
    if BitsInBuffer < 8 then
    begin
      BitsInBuffer := 0;
      Buf := 0;
    end;
  end;

var
  TopicHits: DWord;
  i: Integer;
  j: Integer;
  CachedStreamPos: QWord;
  LastDoc,
  LastLocCode: DWord;
begin
  FinalPosition := AWLCOffset + AWLCSize;
  CachedStreamPos := FStream.Position;
  FStream.Position := AWLCOffset;
  {for i := 0 to AWLCSize-1 do
  begin
    Buf := FStream.ReadByte;
    Write(binStr(Buf, 8), ' ');
  end;}
  FStream.Position := AWLCOffset;
  SetLength(Result, AWLCCount);
  Buf := 0;
  BitsInBuffer := 0;
  LastDoc := 0;

  for i := 0 to AWLCCount-1 do
  begin
    Result[i].TopicIndex := ReadWLC(FDocRootSize) + LastDoc;

    LastDoc := Result[i].TopicIndex;
    TopicHits := ReadWLC(FCodeCountRootSize);
    SetLength(Result[i].LocationCodes, TopicHits);
    LastLocCode := 0;
    for j := 0 to TopicHits-1 do
    begin
      Result[i].LocationCodes[j] := ReadWLC(FLocCodeRootSize) + LastLocCode;
      LastLocCode := Result[i].LocationCodes[j];
    end;
    ClearBuffer;
  end;
  FStream.Position := CachedStreamPos;
end;


constructor TChmSearchReader.Create ( AStream: TStream;
  AFreeStreamOnDestroy: Boolean ) ;
begin
  FStream := AStream;
  FFreeStreamOnDestroy := AFreeStreamOnDestroy;
  ReadCommonData;
end;

destructor TChmSearchReader.Destroy;
begin
  if FFreeStreamOnDestroy then
    FreeAndNil(FStream);
  inherited Destroy;
end;

procedure TChmSearchReader.DumpData (
  AFoundDataEvent: TChmSearchReaderFoundDataEvent ) ;
var
  LastWord: String;
  TheWord: String;
  InTitle: Boolean;
  WLCCount: DWord;
  WLCOffset: DWord;
  WLCSize: DWord;
  FoundHits: TChmWLCTopicArray;
  i: Integer;
  j: Integer;
begin
  MoveToFirstLeafNode;
  LastWord := '';
  repeat
    if  (ReadLeafNodeEntry(LastWord, TheWord, InTitle, WLCCount, WLCOffset, WLCSize) = False) then
    begin
      if FnextLeafNode <> 0 then
      begin
        MoveToNode(FnextLeafNode, 1);
        LastWord := '';
      end
      else
        Break;
    end
    else begin
      LastWord := TheWord;
      //WriteLn('Reading Hits for ', TheWord ,' at ', hexstr(WLCOffset,8) );
      FoundHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
      //WriteLn('DONE Reading Hits for ', TheWord);
     // AFoundDataEvent(Self, TheWord, 0,0);//FoundHits[i].TopicIndex ,-1);//FoundHits[i].LocationCodes[j]);
      for i := 0 to High(FoundHits) do
        for j := 0 to High(FoundHits[i].LocationCodes) do
           AFoundDataEvent(Self, TheWord, FoundHits[i].TopicIndex ,FoundHits[i].LocationCodes[j]);
    end;
  until False; //FStream.Position - FActiveNodeStart >= FIFTI_NODE_SIZE - FActiveNodeFreeSpace
end;

function TChmSearchReader.LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray;
var
  LastWord: String;
  NewWord: String;
  NodeLevel: Integer;
  NewNodePosition: DWord;
  InTitle: Boolean;
  WLCCount: DWord;
  WLCOffset: DWord;
  WLCSize: DWord;
  CompareResult: Integer;
  ReadNextResult: Boolean;
begin
  AWord := LowerCase(AWord);
  NodeLevel := FTreeDepth;
  MoveToRootNode;
  SetLength(Result, 0);
  LastWord := '';
  // descend the index node tree until we find the leafnode
  while NodeLevel > 1 do begin
     //WriteLn('At Node Level ', NodeLevel);
     if ReadIndexNodeEntry(LastWord, NewWord, NewNodePosition) <> False then
     begin
       LastWord := NewWord;
       //WriteLn('Found Index Entry: ', NewWord, ' Comparing to ', AWord);
       if  ChmCompareText(NewWord, AWord) >= 0 then
       begin
         LastWord := '';
         Dec(NodeLevel);
         MoveToNode(NewNodePosition, NodeLevel);
       end;
     end
     else
       Break;
  end;
  if NodeLevel > 1 then
    Exit; // the entry we are looking for is > than the last entry of the last index node

  // now we are in a leafnode
  while ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize) <> False do
  begin
    //WriteLn('Found Leaf Entry: ', NewWord, ' Comparing to ', AWord);
    LastWord := NewWord;
    if Length(NewWord) < Length(AWord) then
      continue;

    if AStartsWith then //it only has to start with the searched term
      CompareResult := ChmCompareText(AWord, Copy(NewWord, 1, Length(AWord)))
    else // it must match exactly
      CompareResult := ChmCompareText(AWord, NewWord);
    if CompareResult < 0 then
      Exit;
    if CompareResult = 0 then
    begin
      if InTitle then
        ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
      else
        Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
      // check if the next entry is the same word since there is an entry for titles and for body

      if  (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize)) then
        ReadNextResult := True
      else if (FNextLeafNode <> 0) then
      begin
        MoveToNode(FNextLeafNode, 1);
        LastWord := '';
        ReadNextResult := (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize));
      end;
      if ReadNextResult and (NewWord = AWord) then
      begin
        if InTitle then
          ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
        else
          Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
      end;
      Exit;
    end;
  end;
end;


end.