{ Copyright (C) <2008> <Andrew Haines> chmfiftimain.pas This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., i51 Franklin Street, Fifth Floor, Boston, MA 02111-1301, USA. } { See the file COPYING.FPC, included in this distribution, for details about the copyright. } {$IFNDEF FPC_DOTTEDUNITS} unit chmFiftiMain; {$ENDIF FPC_DOTTEDUNITS} {$mode objfpc}{$H+} interface {$IFDEF FPC_DOTTEDUNITS} uses System.Classes, Chm.HtmlIndexer; {$ELSE FPC_DOTTEDUNITS} uses Classes, HTMLIndexer; {$ENDIF FPC_DOTTEDUNITS} type TFiftiMainHeader = record Sig: array [0..3] of byte; //$00,$00,$28,$00 HTMLFilesCount: DWord; RootNodeOffset: DWord; Unknown1: DWord; // = 0 LeafNodeCount: DWord; CopyOfRootNodeOffset: DWord; TreeDepth: Word; Unknown2: DWord; // = 7 DocIndexScale: Byte; DocIndexRootSize: Byte; CodeCountScale: Byte; CodeCountRootSize: Byte; LocationCodeScale: Byte; LocationCodeRootSize: Byte; Unknown3: array[0..9] of byte; // = 0 NodeSize: DWord; // 4096; Unknown4: DWord; // 0 or 1; LastDupWordIndex: DWord; LastDupCharIndex: DWord; LongestWordLength: DWord; // maximum 99 TotalWordsIndexed: DWord; // includes duplicates TotalWords: DWord; // word count not including duplicates TotalWordsLengthPart1: DWord; // length of all the words with duplicates plus the next dword! TotalWordsLengthPart2: DWord; TotalWordsLength: DWord; // length of all words not including duplicates WordBlockUnusedBytes: DWord; // who knows, this makes no sense when there are more than one blocks Unknown5: DWord; // 0 HTMLFilesCountMinusOne: DWord; // maybe Unknown6: array[0..23] of Byte; // 0 WindowsCodePage: DWord; // usually 1252 LocalID: DWord; //Unknown7: array [0..893] of Byte; // 0 end; { TFIftiNode } TFIftiNode = class(TObject) FLastWord: String; FWriteStream: TStream; FBlockStream: TMemoryStream; ParentNode: TFIftiNode; OwnsParentNode : boolean; function AdjustedWord(AWord: String; out AOffset: Byte; AOldWord: String): String; procedure ChildIsFull(AWord: String; ANodeOffset: DWord); virtual; abstract; function GuessIfCanHold(AWord: String): Boolean; virtual; abstract; procedure Flush(NewBlockNeeded: Boolean); virtual; abstract; procedure FillRemainingSpace; function RemainingSpace: DWord; constructor Create(AStream: TStream); destructor Destroy; override; end; { TChmSearchWriter } TChmSearchWriter = class(TObject) private FHeaderRec: TFiftiMainHeader; FStream: TStream; FWordList: TIndexedWordList; FActiveLeafNode: TFIftiNode; function GetHasData: Boolean; procedure ProcessWords; procedure WriteHeader(IsPlaceHolder: Boolean); procedure WriteAWord(AWord: TIndexedWord); public procedure WriteToStream; property HasData: Boolean read GetHasData; constructor Create(AStream: TStream; AWordList: TIndexedWordList); destructor Destroy; override; end; { TChmSearchReader } TChmWLCTopic = record TopicIndex: DWord; LocationCodes: array of DWord; end; TChmWLCTopicArray = array of TChmWLCTopic; TChmSearchReader = class; TChmSearchReaderFoundDataEvent = procedure(Sender: TChmSearchReader; AWord: String; ATopic: DWord; AWordIndex: DWord) of object; TChmSearchReader = class(TObject) private FStream: TStream; FFileIsValid: Boolean; FFreeStreamOnDestroy: Boolean; FDocRootSize, FCodeCountRootSize, FLocCodeRootSize: Integer; FTreeDepth: Integer; FRootNodeOffset: DWord; FActiveNodeStart: DWord; FActiveNodeFreeSpace: Word; FNextLeafNode: DWord; procedure ReadCommonData; procedure MoveToFirstLeafNode; procedure MoveToRootNode; procedure MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer); function ReadWordOrPartialWord(ALastWord: String): String; // returns the whole word using the last word as a base function ReadIndexNodeEntry(ALastWord: String; out AWord: String; out ASubNodeStart: DWord): Boolean; function ReadLeafNodeEntry(ALastWord: String; out AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out AWLCOffset: DWord; out AWLCSize: DWord): Boolean; function ReadWLCEntries(AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord): TChmWLCTopicArray; public constructor Create(AStream: TStream; AFreeStreamOnDestroy: Boolean); destructor Destroy; override; procedure DumpData(AFoundDataEvent: TChmSearchReaderFoundDataEvent); function LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray; property FileIsValid: Boolean read FFileIsValid; end; const FIFTI_NODE_SIZE = 4096; implementation {$IFDEF FPC_DOTTEDUNITS} uses System.SysUtils, System.Math, Chm.Base; {$ELSE FPC_DOTTEDUNITS} uses SysUtils, Math, ChmBase; {$ENDIF FPC_DOTTEDUNITS} type { TIndexNode } TIndexNode = class(TFIftiNode) function GuessIfCanHold(AWord: String): Boolean; override; procedure ChildIsFull ( AWord: String; ANodeOffset: DWord ); override; procedure Flush(NewBlockNeeded: Boolean); override; end; { TLeafNode } TLeafNode = class(TFIftiNode) FLeafNodeCount: DWord; FLastNodeStart: DWord; FreeSpace: DWord; FDocRootSize, FCodeRootSize, FLocRootSize: Byte; procedure WriteInitialHeader; Destructor Destroy; override; function GuessIfCanHold(AWord: String): Boolean; override; procedure Flush(NewBlockNeeded: Boolean); override; procedure AddWord(AWord: TIndexedWord); function WriteWLCEntries(AWord: TIndexedWord; ADocRootSize, ACodeRootSize, ALocRootSize: Byte): DWord; property LeafNodeCount: DWord read FLeafNodeCount; property DocRootSize: Byte read FDocRootSize write FDocRootSize; property CodeRootSize: Byte read FCodeRootSize write FCodeRootSize; property LocRootSize: Byte read FLocRootSize write FLocRootSize; end; function GetCompressedIntegerBE(Stream: TStream): DWord; var Buf: Byte; Value: Dword = 0; Shift: Integer = 0; begin repeat Buf := Stream.ReadByte; Value := Value or (Buf and $7F) shl Shift; Inc(Shift, 7); until (Buf and $80) = 0; Result := Value; end; procedure WriteCompressedIntegerBE(Stream: TStream; AInt: DWord); var Bits: Integer; Tmp: DWord; Buf: Byte; begin Tmp := AInt; Bits := 0; while Tmp <> 0 do begin Tmp := Tmp shr 1; Inc(Bits); end; repeat Buf := (AInt shr (Tmp * 7)) and $7F; if Bits > 7 then Buf := Buf or $80; Dec(Bits, 7); Inc(Tmp); Stream.WriteByte(Buf); until Bits <= 0; end; function WriteScaleRootInt(ANumber: DWord; out Bits: DWord; Root: Integer): Byte; var Tmp: DWord; Mask: DWord; // Scale: Integer; NeededBits: Integer; PrefixBits: Integer; RootBits: Integer; begin // Scale := 2; Bits := 0; Result := Root; Tmp := ANumber; NeededBits := 0; while Tmp <> 0 do begin Inc(NeededBits); Tmp := Tmp shr 1; end; PrefixBits := Max(0, NeededBits-Root); RootBits := NeededBits -1; // if RootBits < Root then RootBits := Root; if RootBits < 0 then RootBits := 0; Mask := 0; if RootBits-1 >= 0 then for Tmp := 0 to RootBits-1 do Mask := Mask or (DWord(1) shl Tmp); Bits := not Mask; Bits := Bits shl 1; // make space for empty bit Bits := Bits or (ANumber and Mask); Result := PrefixBits + 1 + RootBits; Bits := (Bits shl (32-Result)) shr (32 - Result); end; { TChmSearchWriter } procedure TChmSearchWriter.ProcessWords; begin FWordList.ForEach(@WriteAword); if FActiveLeafNode <> nil then FActiveLeafNode.Flush(False); // causes the unwritten parts of the tree to be written end; function TChmSearchWriter.GetHasData: Boolean; begin Result := FWordList.IndexedFileCount > 0; end; procedure TChmSearchWriter.WriteHeader ( IsPlaceHolder: Boolean ) ; var TmpNode: TFIftiNode; i: Integer; begin if IsPlaceHolder then begin FStream.Size := $400; // the header size. we will fill this after the nodes have been determined FStream.Position := $400; FillChar(PAnsiChar(TMemoryStream(FStream).Memory)^, $400, 0); FHeaderRec.DocIndexRootSize := 1; FHeaderRec.CodeCountRootSize := 1; FHeaderRec.LocationCodeRootSize := 4; Exit; end; // write the glorious header FHeaderRec.Sig[2] := $28; FHeaderRec.HTMLFilesCount := FWordList.IndexedFileCount; FHeaderRec.RootNodeOffset := FStream.Size-FIFTI_NODE_SIZE; FHeaderRec.LeafNodeCount := TLeafNode(FActiveLeafNode).LeafNodeCount; FHeaderRec.CopyOfRootNodeOffset := FHeaderRec.RootNodeOffset; FHeaderRec.TreeDepth := 0; TmpNode := FActiveLeafNode; while TmpNode <> nil do begin Inc(FHeaderRec.TreeDepth); TmpNode := TmpNode.ParentNode; end; FHeaderRec.DocIndexScale := 2; FHeaderRec.CodeCountScale := 2; FHeaderRec.LocationCodeScale := 2; //FHeaderRec.DocIndexRootSize := 15; //FHeaderRec.CodeCountRootSize := 15; //FHeaderRec.LocationCodeRootSize := 15; FHeaderRec.NodeSize := FIFTI_NODE_SIZE; FHeaderRec.LongestWordLength := FWordList.LongestWord; FHeaderRec.TotalWordsIndexed := FWordList.TotalWordCount; FHeaderRec.TotalWords := FWordList.TotalDIfferentWords; FHeaderRec.TotalWordsLengthPart1 := FWordList.TotalWordLength; FHeaderRec.TotalWordsLength := FWordList.TotalDifferentWordLength; FHeaderRec.WindowsCodePage := 1252; FStream.Position := 0; FStream.Write(FHeaderRec.Sig[0], 4); FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount)); FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset)); FStream.WriteDWord(NtoLE(0)); // unknown 1 FStream.WriteDWord(NtoLE(FHeaderRec.LeafNodeCount)); FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset)); // yes twice FStream.WriteWord(NtoLE(FHeaderRec.TreeDepth)); FStream.WriteDWord(NtoLE(DWord(7))); FStream.WriteByte(2); FStream.WriteByte(FHeaderRec.DocIndexRootSize); FStream.WriteByte(2); FStream.WriteByte(FHeaderRec.CodeCountRootSize); FStream.WriteByte(2); FStream.WriteByte(FHeaderRec.LocationCodeRootSize); // eat 10 bytes FStream.WriteWord(0); FStream.WriteDWord(0); FStream.WriteDWord(0); FStream.WriteDWord(NtoLE(FHeaderRec.NodeSize)); FStream.WriteDWord(NtoLE(DWord(0))); FStream.WriteDWord(1); FStream.WriteDWord(5); FStream.WriteDWord(NtoLE(FHeaderRec.LongestWordLength)); FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsIndexed)); FStream.WriteDWord(NtoLE(FHeaderRec.TotalWords)); FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart1)); FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart2)); FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLength)); FStream.WriteDWord(NtoLE(TLeafNode(FActiveLeafNode).FreeSpace)); FStream.WriteDWord(NtoLE(0)); FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount-1)); for i := 0 to 23 do FStream.WriteByte(0); FStream.WriteDWord(NtoLE(FHeaderRec.WindowsCodePage)); FStream.WriteDWord(NtoLE(DWord(1033))); // LCID for i := 0 to 893 do FStream.WriteByte(0); end; procedure TChmSearchWriter.WriteAWord ( AWord: TIndexedWord ) ; begin if FActiveLeafNode = nil then begin FActiveLeafNode := TLeafNode.Create(FStream); with TLeafNode(FActiveLeafNode) do begin DocRootSize := FHeaderRec.DocIndexRootSize; CodeRootSize := FHeaderRec.CodeCountRootSize; LocRootSize := FHeaderRec.LocationCodeRootSize; end; end; if FActiveLeafNode.GuessIfCanHold(AWord.TheWord) = False then begin FActiveLeafNode.Flush(True); end; TLeafNode(FActiveLeafNode).AddWord(AWord); end; procedure TChmSearchWriter.WriteToStream; begin WriteHeader(True); ProcessWords; WriteHeader(False); end; constructor TChmSearchWriter.Create ( AStream: TStream; AWordList: TIndexedWordList ) ; begin FStream := AStream; FWordList := AWordList; FActiveLeafNode:=NIL; end; destructor TChmSearchWriter.Destroy; begin freeandnil(FActiveLeafNode); inherited; end; { TLeafNode } function TFIftiNode.RemainingSpace: DWord; begin Result := FIFTI_NODE_SIZE - FBlockStream.Position; end; constructor TFIftiNode.Create ( AStream: TStream ) ; begin inherited Create; FWriteStream := AStream; FBlockStream := TMemoryStream.Create; OwnsParentNode :=false; end; destructor TFIftiNode.Destroy; begin FBlockStream.Free; if OwnsParentNode then ParentNode.Free; inherited Destroy; end; procedure TFIftiNode.FillRemainingSpace; begin while RemainingSpace > 0 do FBlockStream.WriteByte(0); end; function TFIftiNode.AdjustedWord ( AWord: String; out AOffset: Byte; AOldWord: String ) : String; var Count1, Count2: Integer; Count: Integer; i: Integer; begin if AWord = AOldWord then begin AOffset := Length(AWord); Exit(''); end; // else Count1 := Length(AOldWord); Count2 := Length(AWord); if Count1<Count2 then Count := Count1 else Count := Count2; for i := 1 to Count do begin AOffset := i-1; if AOldWord[i] <> AWord[i] then Exit(Copy(AWord, i, Length(AWord))); end; Result := AWord; AOffset := 0; end; procedure TLeafNode.WriteInitialHeader; begin FBlockStream.WriteDWord(0); FBlockStream.WriteWord(0); FBlockStream.WriteWord(0); end; destructor TLeafNode.Destroy; begin inherited Destroy; end; function TLeafNode.GuessIfCanHold ( AWord: String ) : Boolean; var WordOffset: Byte; begin Result := 17 + Length(AdjustedWord(AWord, WordOffset, FLastWord)) < RemainingSpace; end; procedure TLeafNode.Flush(NewBlockNeeded: Boolean); var FTmpPos: DWord; begin Inc(FLeafNodeCount); FTmpPos := FWriteStream.Position; // update the previous leaf node about our position. if FLastNodeStart > 0 then begin FWriteStream.Position := FLastNodeStart; FWriteStream.WriteDWord(NtoLE(FTmpPos)); FWriteStream.Position := FTmpPos; end; FLastNodeStart := FTmpPos; FreeSpace := RemainingSpace; FillRemainingSpace; // update the leaf header to show the available space. FBlockStream.Position := 6; FBlockStream.WriteWord(NtoLE(Word(FreeSpace))); // copy the leaf block to the fiftimain file FBlockStream.Position := 0; FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE); FBlockStream.Position := 0; if NewBlockNeeded or ((NewBlockNeeded = False) and (ParentNode <> nil)) then begin if ParentNode = nil then begin ParentNode := TIndexNode.Create(FWriteStream); OwnsParentNode:=True; end; ParentNode.ChildIsFull(FLastWord, FLastNodeStart); if (NewBlockNeeded = False) then ParentNode.Flush(False); end; FLastWord := ''; end; procedure TLeafNode.AddWord ( AWord: TIndexedWord ) ; var Offset: Byte; NewWord: String; WLCSize: DWord; begin if Length(AWord.TheWord) > 99 then Exit; // Maximum word length is 99 if FBlockStream.Position = 0 then WriteInitialHeader; NewWord := AdjustedWord(AWord.TheWord, Offset, FLastWord); FLastWord := AWord.TheWord; FBlockStream.WriteByte(Length(NewWord)+1); FBlockStream.WriteByte(Offset); // length can be 0 if it is the same word as the last. there is a word entry each for title and content if Length(NewWord) > 0 then FBlockStream.Write(NewWord[1], Length(NewWord)); FBlockStream.WriteByte(Ord(AWord.IsTitle)); WriteCompressedIntegerBE(FBlockStream, AWord.DocumentCount); FBlockStream.WriteDWord(NtoLE(DWord(FWriteStream.Position))); FBlockStream.WriteWord(0); // write WLC to FWriteStream so we can write the size of the wlc entries WLCSize := WriteWLCEntries(AWord, FDocRootSize, FCodeRootSize, FLocRootSize); WriteCompressedIntegerBE(FBlockStream, WLCSize); if FBlockStream.Position > FIFTI_NODE_SIZE then raise Exception.Create('FIFTIMAIN Leaf node has written past the block!'); end; function Min(AValue, BValue: Byte): Byte; begin if AValue < BValue then Result := AValue else Result := BValue; end; function Max(AValue, BValue: Byte): Byte; begin if AValue > BValue then Result := AValue else Result := BValue; end; function Max(AValue, BValue: Integer): Integer; begin if AValue > BValue then Result := AValue else Result := BValue; end; function Max(AValue, BValue: DWord): DWord; begin if AValue > BValue then Result := AValue else Result := BValue; end; function TLeafNode.WriteWLCEntries ( AWord: TIndexedWord ; ADocRootSize, ACodeRootSize, ALocRootSize: Byte) : DWord; var LastDocIndex: DWord; LastLocCode: DWord; UsedBits: Byte; Buf: Byte; function NewDocDelta(ADocIndex: DWord): DWord; begin Result := ADocIndex - LastDocIndex; LastDocIndex := ADocIndex; end; function NewLocCode(ALocCode: DWord): DWord; begin Result := ALocCode - LastLocCode; LastLocCode := ALocCode; end; procedure AddValue(AValue: DWord; BitCount: Byte); var NeededBits: Byte; Tmp: Byte; begin AValue := AValue shl (32 - BitCount); while BitCount > 0 do begin NeededBits := 8 - UsedBits; Tmp := Hi(Hi(DWord(AValue shr (UsedBits)))); Buf := Buf or Tmp; Inc(UsedBits, Min(BitCount, NeededBits)); AValue := AValue shl Min(BitCount, NeededBits); Dec(BitCount, Min(BitCount, NeededBits)); if (UsedBits = 8) then begin FWriteStream.WriteByte(Buf); UsedBits := 0; NeededBits := 0; Buf := 0; end; end; end; procedure FlushBuffer; begin if UsedBits > 0 then FWriteStream.WriteByte(Buf); UsedBits := 0; Buf := 0; end; var DocDelta: DWord; LocDelta: DWord; StartPos: DWord; Bits: DWord; BitCount: Byte; i, j: Integer; Doc: TIndexDocument; // proced begin StartPos := FWriteStream.Position; LastDocIndex := 0; UsedBits := 0; Buf := 0; for i := 0 to AWord.DocumentCount-1 do begin LastLocCode := 0; Doc := AWord.GetLogicalDocument(i); DocDelta := NewDocDelta(Doc.DocumentIndex); BitCount := WriteScaleRootInt(DocDelta, Bits, ADocRootSize); AddValue(Bits, BitCount); BitCount := WriteScaleRootInt(Doc.NumberOfIndexEntries, Bits, ACodeRootSize); AddValue(Bits, BitCount); for j := 0 to Doc.NumberOfIndexEntries-1 do begin LocDelta := NewLocCode(Doc.IndexEntry[j]); BitCount := WriteScaleRootInt(LocDelta, Bits, ALocRootSize); AddValue(Bits, BitCount); end; FlushBuffer; end; Result := FWriteStream.Position-StartPos; end; { TIndexNode } function TIndexNode.GuessIfCanHold ( AWord: String ) : Boolean; var Offset: Byte; begin Result := FBlockStream.Position + 8 + Length(AdjustedWord(AWord, Offset, FLastWord)) < FIFTI_NODE_SIZE; end; procedure TIndexNode.ChildIsFull ( AWord: String; ANodeOffset: DWord ) ; var Offset: Byte; NewWord: String; begin if FBlockStream.Position = 0 then FBlockStream.WriteWord(0); // free space at end. updated when the block is flushed if GuessIfCanHold(AWord) = False then Flush(True); NewWord := AdjustedWord(AWord, Offset, FLastWord); FLastWord:=AWord; // Write the Index node Entry FBlockStream.WriteByte(Length(NewWord)+1); FBlockStream.WriteByte(Offset); FBlockStream.Write(NewWord[1], Length(NewWord)); FBlockStream.WriteDWord(NtoLE(ANodeOffset)); FBlockStream.WriteWord(0); if FBlockStream.Position > FIFTI_NODE_SIZE then raise Exception.Create('FIFTIMAIN Index node has written past the block!'); end; procedure TIndexNode.Flush ( NewBlockNeeded: Boolean ) ; var RemSize: DWord; begin if NewBlockNeeded then begin if ParentNode = nil then begin ParentNode := TIndexNode.Create(FWriteStream); OwnsParentNode:=True; end; end; if ParentNode <> nil then ParentNode.ChildIsFull(FLastWord, FWriteStream.Position); RemSize := RemainingSpace; FillRemainingSpace; FBlockStream.Position := 0; FBlockStream.WriteWord(NtoLE(RemSize)); FBlockStream.Position := 0; FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE); FBlockStream.Position := 0; FLastWord := ''; if NewBlockNeeded then FBlockStream.WriteDWord(0) // placeholder to write free space in when block is full else if ParentNode <> nil then ParentNode.Flush(NewBlockNeeded); end; { TChmSearchReader } procedure TChmSearchReader.ReadCommonData; var Sig: DWord; begin FStream.Position := 0; Sig := LEtoN(FStream.ReadDWord); FFileIsValid := Sig = $00280000; if not FileIsValid then Exit; // root node address FStream.Position := $8; FRootNodeOffset := LEtoN(FStream.ReadDWord); // Tree Depth FStream.Position := $18; FTreeDepth := LEtoN(FStream.ReadWord); // Root sizes for scale and root integers FStream.Position := $1E; if FStream.ReadByte <> 2 then // we only can read the files when scale is 2 FFileIsValid := False; FDocRootSize := FStream.ReadByte; if FStream.ReadByte <> 2 then FFileIsValid := False; FCodeCountRootSize := FStream.ReadByte; if FStream.ReadByte <> 2 then FFileIsValid := False; FLocCodeRootSize := FStream.ReadByte; end; procedure TChmSearchReader.MoveToFirstLeafNode; var NodeDepth: Integer; NodeOffset: DWord; LastWord: String; NewWord: String; begin NodeDepth := FTreeDepth; MoveToRootNode; while NodeDepth > 1 do begin LastWord := ''; ReadIndexNodeEntry(LastWord, NewWord, NodeOffset); Dec(NodeDepth); MoveToNode(NodeOffset, NodeDepth); end; end; procedure TChmSearchReader.MoveToRootNode; begin MoveToNode(FRootNodeOffset, FTreeDepth); end; procedure TChmSearchReader.MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer); begin FStream.Position := ANodeOffset; FActiveNodeStart := FStream.Position; if ANodeDepth > 1 then begin FnextLeafNode := 0; FActiveNodeFreeSpace := LEtoN(FStream.ReadWord); // empty space at end of node end else begin FnextLeafNode := LEtoN(FStream.ReadDWord); FStream.ReadWord; FActiveNodeFreeSpace := LEtoN(FStream.ReadWord); end; end; function TChmSearchReader.ReadWordOrPartialWord ( ALastWord: String ) : String; var WordLength: Integer; CopyLastWordCharCount: Integer; begin WordLength := FStream.ReadByte; CopyLastWordCharCount := FStream.ReadByte; if CopyLastWordCharCount > 0 then Result := Copy(ALastWord, 1, CopyLastWordCharCount); SetLength(Result, (WordLength-1) + CopyLastWordCharCount); if WordLength > 1 then FStream.Read(Result[1+CopyLastWordCharCount], WordLength-1); end; function TChmSearchReader.ReadIndexNodeEntry (ALastWord: String; out AWord: String; out ASubNodeStart: DWord ): Boolean; begin Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace; if not Result then Exit; AWord := ReadWordOrPartialWord(ALastWord); ASubNodeStart := LEtoN(FStream.ReadDWord); FStream.ReadWord; end; function TChmSearchReader.ReadLeafNodeEntry ( ALastWord: String; out AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out AWLCOffset: DWord; out AWLCSize: DWord ): Boolean; begin Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace; if not Result then Exit; AWord := ReadWordOrPartialWord(ALastWord); AInTitle := FStream.ReadByte = 1; AWLCCount := GetCompressedIntegerBE(FStream); AWLCOffset := LEtoN(FStream.ReadDWord); FStream.ReadWord; AWLCSize := GetCompressedIntegerBE(FStream); end; function TChmSearchReader.ReadWLCEntries (AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord ) : TChmWLCTopicArray; var Buf: Byte; BitsInBuffer: Integer; FinalPosition: int64; function GetNextByte(): Boolean; begin Result := (FStream.Position < FinalPosition); if Result then begin Buf := FStream.ReadByte; Inc(BitsInBuffer, 8); end; end; function ShiftBuffer: Boolean; begin Buf := (Buf and $7F) shl 1; Dec(BitsInBuffer); Result := (BitsInBuffer > 0) or GetNextByte(); end; function ReadWLC(RootSize: DWord): DWord; var PrefixBits: Integer = 0; RemainingBits: Integer; // only the bits for this number not the bits in buffer begin if (BitsInBuffer = 0) then GetNextByte(); Result := (Buf and $80) shr 7; while (Buf and $80) > 0 do // find out how many prefix bits there are begin Inc(PrefixBits); if not ShiftBuffer then Exit; end; // skip divider (zero) bit if not ShiftBuffer then Exit; Remainingbits := RootSize + Max(Integer(PrefixBits-1), 0); while RemainingBits > 0 do begin Result := Result shl 1; Result := Result or (Buf shr 7); Dec(RemainingBits); if not ShiftBuffer then Exit; end; end; procedure ClearBuffer; begin if BitsInBuffer < 8 then begin BitsInBuffer := 0; Buf := 0; end; end; var TopicHits: DWord; i: Integer; j: Integer; CachedStreamPos: QWord; LastDoc, LastLocCode: DWord; begin FinalPosition := AWLCOffset + AWLCSize; CachedStreamPos := FStream.Position; FStream.Position := AWLCOffset; {for i := 0 to AWLCSize-1 do begin Buf := FStream.ReadByte; Write(binStr(Buf, 8), ' '); end;} FStream.Position := AWLCOffset; SetLength(Result, AWLCCount); Buf := 0; BitsInBuffer := 0; LastDoc := 0; for i := 0 to AWLCCount-1 do begin Result[i].TopicIndex := ReadWLC(FDocRootSize) + LastDoc; LastDoc := Result[i].TopicIndex; TopicHits := ReadWLC(FCodeCountRootSize); SetLength(Result[i].LocationCodes, TopicHits); LastLocCode := 0; for j := 0 to TopicHits-1 do begin Result[i].LocationCodes[j] := ReadWLC(FLocCodeRootSize) + LastLocCode; LastLocCode := Result[i].LocationCodes[j]; end; ClearBuffer; end; FStream.Position := CachedStreamPos; end; constructor TChmSearchReader.Create ( AStream: TStream; AFreeStreamOnDestroy: Boolean ) ; begin FStream := AStream; FFreeStreamOnDestroy := AFreeStreamOnDestroy; ReadCommonData; end; destructor TChmSearchReader.Destroy; begin if FFreeStreamOnDestroy then FreeAndNil(FStream); inherited Destroy; end; procedure TChmSearchReader.DumpData ( AFoundDataEvent: TChmSearchReaderFoundDataEvent ) ; var LastWord: String; TheWord: String; InTitle: Boolean; WLCCount: DWord; WLCOffset: DWord; WLCSize: DWord; FoundHits: TChmWLCTopicArray; i: Integer; j: Integer; begin MoveToFirstLeafNode; LastWord := ''; repeat if (ReadLeafNodeEntry(LastWord, TheWord, InTitle, WLCCount, WLCOffset, WLCSize) = False) then begin if FnextLeafNode <> 0 then begin MoveToNode(FnextLeafNode, 1); LastWord := ''; end else Break; end else begin LastWord := TheWord; //WriteLn('Reading Hits for ', TheWord ,' at ', hexstr(WLCOffset,8) ); FoundHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize); //WriteLn('DONE Reading Hits for ', TheWord); // AFoundDataEvent(Self, TheWord, 0,0);//FoundHits[i].TopicIndex ,-1);//FoundHits[i].LocationCodes[j]); for i := 0 to High(FoundHits) do for j := 0 to High(FoundHits[i].LocationCodes) do AFoundDataEvent(Self, TheWord, FoundHits[i].TopicIndex ,FoundHits[i].LocationCodes[j]); end; until False; //FStream.Position - FActiveNodeStart >= FIFTI_NODE_SIZE - FActiveNodeFreeSpace end; function TChmSearchReader.LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray; var LastWord: String; NewWord: String; NodeLevel: Integer; NewNodePosition: DWord; InTitle: Boolean; WLCCount: DWord; WLCOffset: DWord; WLCSize: DWord; CompareResult: Integer; ReadNextResult: Boolean; begin AWord := LowerCase(AWord); NodeLevel := FTreeDepth; MoveToRootNode; SetLength(Result, 0); LastWord := ''; // descend the index node tree until we find the leafnode while NodeLevel > 1 do begin //WriteLn('At Node Level ', NodeLevel); if ReadIndexNodeEntry(LastWord, NewWord, NewNodePosition) <> False then begin LastWord := NewWord; //WriteLn('Found Index Entry: ', NewWord, ' Comparing to ', AWord); if ChmCompareText(NewWord, AWord) >= 0 then begin LastWord := ''; Dec(NodeLevel); MoveToNode(NewNodePosition, NodeLevel); end; end else Break; end; if NodeLevel > 1 then Exit; // the entry we are looking for is > than the last entry of the last index node // now we are in a leafnode while ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize) <> False do begin //WriteLn('Found Leaf Entry: ', NewWord, ' Comparing to ', AWord); LastWord := NewWord; if Length(NewWord) < Length(AWord) then continue; if AStartsWith then //it only has to start with the searched term CompareResult := ChmCompareText(AWord, Copy(NewWord, 1, Length(AWord))) else // it must match exactly CompareResult := ChmCompareText(AWord, NewWord); if CompareResult < 0 then Exit; if CompareResult = 0 then begin if InTitle then ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize) else Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize); // check if the next entry is the same word since there is an entry for titles and for body if (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize)) then ReadNextResult := True else if (FNextLeafNode <> 0) then begin MoveToNode(FNextLeafNode, 1); LastWord := ''; ReadNextResult := (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize)); end; if ReadNextResult and (NewWord = AWord) then begin if InTitle then ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize) else Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize); end; Exit; end; end; end; end.