* Fixed several chm bugs.

* Searching for particular words is now supported instead of only a full dump of the index.
* Generated files no longer crash the MS reader when searching.

git-svn-id: trunk@12119 -
andrew 2008-11-16 05:45:19 +00:00
parent 17a08efb82
commit d423812928
4 changed files with 221 additions and 41 deletions
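
For orientation, a rough usage sketch follows (it is not part of this commit): it assumes the full-text index lives in a '/$FIftiMain' stream, that TChmReader exposes GetObject to fetch it, and it simply glues together the TChmSearchReader.LookupWord and TChmReader.LookupTopicByID calls introduced below.

program ChmSearchDemo;
{$mode objfpc}{$H+}
uses
  Classes, SysUtils, chmreader, chmFIftiMain;

// Sketch only: the stream name and the GetObject call are assumptions, and
// error handling is omitted.
procedure SearchWord(const AFileName, AWord: String);
var
  Reader: TChmReader;
  BodyHits, TitleHits: TChmWLCTopicArray;
  Title, Url: String;
  i: Integer;
begin
  Reader := TChmReader.Create(TFileStream.Create(AFileName, fmOpenRead), True);
  try
    // attach a search reader for the full-text index stream
    if Reader.SearchReader = nil then
      Reader.SearchReader := TChmSearchReader.Create(Reader.GetObject('/$FIftiMain'), True);
    // body hits are the function result; title hits arrive via the out parameter
    BodyHits := Reader.SearchReader.LookupWord(LowerCase(AWord), TitleHits);
    WriteLn('title hits: ', Length(TitleHits));
    for i := 0 to High(BodyHits) do
    begin
      Url := Reader.LookupTopicByID(BodyHits[i].TopicIndex, Title);
      WriteLn(Title, ' -> ', Url);
    end;
  finally
    Reader.Free; // frees the owned file stream and the attached SearchReader
  end;
end;

begin
  if ParamCount = 2 then
    SearchWord(ParamStr(1), ParamStr(2));
end.

Title hits and body hits come back separately because, as the new lookup code notes, the index keeps one leaf entry for a word's occurrences in document bodies and another for occurrences in titles.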


@@ -124,14 +124,14 @@ type
procedure MoveToRootNode;
procedure MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
function ReadWordOrPartialWord(ALastWord: String): String; // returns the whole word using the last word as a base
procedure ReadRootNodeEntry(ALastWord: String; out AWord: String; out ASubNodeStart: DWord);
function ReadIndexNodeEntry(ALastWord: String; out AWord: String; out ASubNodeStart: DWord): Boolean;
function ReadLeafNodeEntry(ALastWord: String; out AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out AWLCOffset: DWord; out AWLCSize: DWord): Boolean;
function ReadWLCEntries(AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord): TChmWLCTopicArray;
public
constructor Create(AStream: TStream; AFreeStreamOnDestroy: Boolean);
destructor Destroy; override;
procedure DumpData(AFoundDataEvent: TChmSearchReaderFoundDataEvent);
function LookupWord(AWord: String): TChmWLCTopicArray;
function LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray): TChmWLCTopicArray;
property FileIsValid: Boolean read FFileIsValid;
end;
@@ -172,6 +172,44 @@ type
property LocRootSize: Byte read FLocRootSize write FLocRootSize;
end;
function GetCompressedIntegerBE(Stream: TStream): DWord;
var
Buf: Byte;
Value: Dword = 0;
Shift: Integer = 0;
begin
repeat
Buf := Stream.ReadByte;
Value := Value or (Buf and $7F) shl Shift;
Inc(Shift, 7);
until (Buf and $80) = 0;
Result := Value;
end;
procedure WriteCompressedIntegerBE(Stream: TStream; AInt: DWord);
var
Bits: Integer;
Tmp: DWord;
Buf: Byte;
begin
Tmp := AInt;
Bits := 0;
while Tmp <> 0 do
begin
Tmp := Tmp shr 1;
Inc(Bits);
end;
repeat
Buf := (AInt shr (Tmp * 7)) and $7F;
if Bits > 7 then
Buf := Buf or $80;
Dec(Bits, 7);
Inc(Tmp);
Stream.WriteByte(Buf);
until Bits <= 0;
end;
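{ Illustrative note, inferred from the two helpers above rather than taken from
  the original source: each byte carries 7 data bits in its low half, and bit 7
  means "another byte follows"; groups are emitted low-order first.  For example
  300 ($12C) is written as the two bytes $AC $02, and reading them back gives
  ($2C shl 0) or ($02 shl 7) = $12C. }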
function WriteScaleRootInt(ANumber: DWord; out Bits: DWord; Root: Integer): Byte;
var
Tmp: DWord;
@@ -486,14 +524,14 @@ begin
FBlockStream.WriteByte(Offset);
FBlockStream.Write(NewWord[1], Length(Trim(NewWord)));
FBlockStream.WriteByte(Ord(AWord.IsTitle));
WriteCompressedInteger(FBlockStream, AWord.DocumentCount);
WriteCompressedIntegerBE(FBlockStream, AWord.DocumentCount);
FBlockStream.WriteDWord(NtoLE(DWord(FWriteStream.Position)));
FBlockStream.WriteWord(0);
// write WLC to FWriteStream so we can write the size of the wlc entries
WLCSize := WriteWLCEntries(AWord, FDocRootSize, FCodeRootSize, FLocRootSize);
WriteCompressedInteger(FBlockStream, WLCSize);
WriteCompressedIntegerBE(FBlockStream, WLCSize);
end;
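{ Leaf-entry layout as implied by the writes above and by ReadLeafNodeEntry in
  the reader (not re-checked against the ITSF/CHM spec): a partial-word reuse
  byte, the word text, a title-flag byte, the document (WLC) count as a
  compressed integer, a DWord offset of the WLC data, a reserved Word of zero,
  and finally the size of the WLC data as a compressed integer. }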
function Min(AValue, BValue: Byte): Byte;
@@ -530,7 +568,6 @@ function TLeafNode.WriteWLCEntries ( AWord: TIndexedWord ; ADocRootSize, ACodeRo
var
LastDocIndex: DWord;
LastLocCode: DWord;
WLCLastWord: String;
UsedBits: Byte;
Buf: Byte;
function NewDocDelta(ADocIndex: DWord): DWord;
@@ -720,7 +757,7 @@ begin
while NodeDepth > 1 do
begin
LastWord := '';
ReadRootNodeEntry(LastWord, NewWord, NodeOffset);
ReadIndexNodeEntry(LastWord, NewWord, NodeOffset);
Dec(NodeDepth);
MoveToNode(NodeOffset, NodeDepth);
end;
@@ -761,28 +798,30 @@ begin
FStream.Read(Result[1+CopyLastWordCharCount], WordLength-1);
end;
procedure TChmSearchReader.ReadRootNodeEntry (ALastWord: String; out AWord: String; out
ASubNodeStart: DWord ) ;
function TChmSearchReader.ReadIndexNodeEntry (ALastWord: String; out AWord: String; out
ASubNodeStart: DWord ): Boolean;
begin
Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
if not Result then
Exit;
AWord := ReadWordOrPartialWord(ALastWord);
ASubNodeStart := LEtoN(FStream.ReadDWord);
FStream.ReadWord;
end;
function TChmSearchReader.ReadLeafNodeEntry ( ALastWord: String; out
AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out
AWLCOffset: DWord; out AWLCSize: DWord ): Boolean;
var
WordLength: Integer;
begin
Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
if not Result then
Exit;
AWord := ReadWordOrPartialWord(ALastWord);
AInTitle := FStream.ReadByte = 1;
AWLCCount := GetCompressedInteger(FStream);
AWLCCount := GetCompressedIntegerBE(FStream);
AWLCOffset := LEtoN(FStream.ReadDWord);
FStream.ReadWord;
AWLCSize := GetCompressedInteger(FStream);
AWLCSize := GetCompressedIntegerBE(FStream);
end;
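{ Note on the bound check used by both entry readers above (a reading of the
  code, not original commentary): FActiveNodeFreeSpace is the unused space at
  the end of the current FIFTI_NODE_SIZE-sized node, so Result stays True only
  while the cursor is still inside the used part of the node; once it turns
  False the caller has to move on to another node. }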
@@ -857,13 +896,12 @@ var
begin
CachedStreamPos := FStream.Position;
FStream.Position := AWLCOffset;
for i := 0 to AWLCSize-1 do
{for i := 0 to AWLCSize-1 do
begin
Buf := FStream.ReadByte;
Write(binStr(Buf, 8), ' ');
end;
end;}
FStream.Position := AWLCOffset;
SetLength(Result, AWLCCount);
Buf := 0;
BitsInBuffer := 0;
@@ -932,7 +970,10 @@ begin
end
else begin
LastWord := TheWord;
//WriteLn('Reading Hits for ', TheWord ,' at ', hexstr(WLCOffset,8) );
FoundHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
//WriteLn('DONE Reading Hits for ', TheWord);
// AFoundDataEvent(Self, TheWord, 0,0);//FoundHits[i].TopicIndex ,-1);//FoundHits[i].LocationCodes[j]);
for i := 0 to High(FoundHits) do
for j := 0 to High(FoundHits[i].LocationCodes) do
AFoundDataEvent(Self, TheWord, FoundHits[i].TopicIndex ,FoundHits[i].LocationCodes[j]);
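// (Reading of the loops above: the found-data callback fires once for every
// word/topic/location-code combination decoded from the WLC data.)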
@@ -940,16 +981,79 @@ begin
until False; //FStream.Position - FActiveNodeStart >= FIFTI_NODE_SIZE - FActiveNodeFreeSpace
end;
function TChmSearchReader.LookupWord(AWord: String): TChmWLCTopicArray;
function TChmSearchReader.LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray): TChmWLCTopicArray;
var
LastWord: String;
NewWord: String;
NodeLevel: Integer;
NewNodePosition: DWord;
InTitle: Boolean;
WLCCount: DWord;
WLCOffset: DWord;
WLCSize: DWord;
CompareResult: Integer;
ReadNextResult: Boolean;
begin
{ if not AIsReadyToReadWLC then
begin }
AWord := LowerCase(AWord);
NodeLevel := FTreeDepth;
MoveToRootNode;
SetLength(Result, 0);
LastWord := '';
// descend the index node tree until we find the leafnode
while NodeLevel > 1 do begin
//WriteLn('At Node Level ', NodeLevel);
if ReadIndexNodeEntry(LastWord, NewWord, NewNodePosition) <> False then
begin
//WriteLn('Found Index Entry: ', NewWord, ' Comparing to ', AWord);
if ChmCompareText(NewWord, AWord) >= 0 then
begin
LastWord := '';
Dec(NodeLevel);
MoveToNode(NewNodePosition, NodeLevel);
end;
end
else
Break;
end;
if NodeLevel > 1 then
Exit; // the entry we are looking for is > than the last entry of the last index node
{ end
else begin
//ReadWLCEntries();
end;}
// now we are in a leafnode
while ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize) <> False do
begin
//WriteLn('Found Leaf Entry: ', NewWord, ' Comparing to ', AWord);
LastWord := NewWord;
CompareResult := ChmCompareText(AWord, NewWord);
if CompareResult < 0 then
Exit;
if CompareResult = 0 then
begin
if InTitle then
ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
else
Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
// check if the next entry is the same word since there is an entry for titles and for body
if (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize)) then
ReadNextResult := True
else if (FNextLeafNode <> 0) then
begin
MoveToNode(FNextLeafNode, 1);
LastWord := '';
ReadNextResult := (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize));
end;
if ReadNextResult and (NewWord = AWord) then
begin
if InTitle then
ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
else
Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
end;
Exit;
end;
end;
end;
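{ How the lookup above proceeds, as read from the new code: index nodes are
  scanned until an entry compares >= the requested word and the search descends
  into that child until a leaf node is reached; leaf entries are then walked in
  order, and since a word may have one entry for body text and one for titles,
  the entry right after a match (possibly at the start of the next leaf node)
  is also checked so that both the function result and ATitleHits get filled. }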
end.


@@ -28,7 +28,7 @@ unit chmreader;
interface
uses
Classes, SysUtils, chmbase, paslzx;
Classes, SysUtils, chmbase, paslzx, chmFIftiMain;
type
@@ -99,14 +99,22 @@ type
fTitle: String;
fPreferedFont: String;
fContextList: TContextList;
fTOPICSStream,
fURLSTRStream,
fURLTBLStream,
fStringsStream: TMemoryStream;
fLocaleID: DWord;
private
FSearchReader: TChmSearchReader;
procedure ReadCommonData;
function ReadStringsEntry(APosition: DWord): String;
function ReadURLSTR(APosition: DWord): String;
public
constructor Create(AStream: TStream; FreeStreamOnDestroy: Boolean); override;
destructor Destroy; override;
public
function GetContextUrl(Context: THelpContext): String;
function LookupTopicByID(ATopicID: Integer; out ATitle: String): String; // returns a url
function HasContextList: Boolean;
property DefaultPage: String read fDefaultPage;
property IndexFile: String read fIndexFile;
@@ -114,6 +122,7 @@ type
property Title: String read fTitle write fTitle;
property PreferedFont: String read fPreferedFont;
property LocaleID: dword read fLocaleID;
property SearchReader: TChmSearchReader read FSearchReader write FSearchReader;
end;
{ TChmFileList }
@@ -430,6 +439,41 @@ begin
{$ENDIF}
end;
function TChmReader.ReadStringsEntry ( APosition: DWord ) : String;
begin
Result := '';
if fStringsStream = nil then
fStringsStream := GetObject('/#STRINGS');
if fStringsStream = nil then
Exit;
if APosition < fStringsStream.Size-1 then
begin
Result := PChar(fStringsStream.Memory+APosition);
end;
end;
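// (Reading of the code above: /#STRINGS is treated as a pool of #0-terminated
// strings addressed by byte offset, hence the PChar cast at APosition.)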
function TChmReader.ReadURLSTR ( APosition: DWord ) : String;
var
URLStrURLOffset: DWord;
begin
if fURLSTRStream = nil then
fURLSTRStream := GetObject('/#URLSTR');
if fURLTBLStream = nil then
fURLTBLStream := GetObject('/#URLTBL');
if (fURLTBLStream <> nil) and (fURLSTRStream <> nil) then
begin
fURLTBLStream.Position := APosition;
fURLTBLStream.ReadDWord; // unknown
fURLTBLStream.ReadDWord; // TOPIC index #
fURLSTRStream.Position := LEtoN(fURLTBLStream.ReadDWord);
fURLSTRStream.ReadDWord;
fURLSTRStream.ReadDWord;
if fURLSTRStream.Position < fURLSTRStream.Size-1 then
Result := '/'+PChar(fURLSTRStream.Memory+fURLSTRStream.Position);
end;
end;
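{ Layout implied by the code above and by TChmWriter.AddURL further down (not
  re-checked against the spec): a /#URLTBL record is three DWords - an unknown
  value, the topic index and an offset into /#URLSTR - and at that offset
  /#URLSTR holds two DWords followed by the #0-terminated URL, which is why two
  DWords are skipped before the PChar cast here. }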
constructor TChmReader.Create(AStream: TStream; FreeStreamOnDestroy: Boolean);
begin
inherited Create(AStream, FreeStreamOnDestroy);
@@ -442,6 +486,11 @@ end;
destructor TChmReader.Destroy;
begin
fContextList.Free;
FreeAndNil(FSearchReader);
FreeAndNil(fTOPICSStream);
FreeAndNil(fURLSTRStream);
FreeAndNil(fURLTBLStream);
FreeAndNil(fStringsStream);
inherited Destroy;
end;
@@ -787,6 +836,31 @@ begin
Result := fContextList.GetURL(Context);
end;
function TChmReader.LookupTopicByID ( ATopicID: Integer; out ATitle: String) : String;
var
TopicURLTBLOffset: DWord;
TopicTitleOffset: DWord;
begin
Result := '';
ATitle := '';
//WriteLn('Getting topic# ',ATopicID);
if fTOPICSStream = nil then
fTOPICSStream := GetObject('/#TOPICS');
if fTOPICSStream = nil then
Exit;
fTOPICSStream.Position := ATopicID * 16;
if fTOPICSStream.Position = ATopicID * 16 then
begin
fTOPICSStream.ReadDWord;
TopicTitleOffset := LEtoN(fTOPICSStream.ReadDWord);
TopicURLTBLOffset := LEtoN(fTOPICSStream.ReadDWord);
if TopicTitleOffset <> $FFFFFFFF then
ATitle := ReadStringsEntry(TopicTitleOffset);
//WriteLn('Got a title: ', ATitle);
Result := ReadURLSTR(TopicURLTBLOffset);
end;
end;
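{ Inference from the code above, not original commentary: /#TOPICS is an array
  of 16-byte records indexed by topic number; the first DWord is read and
  discarded, the second is the title offset in /#STRINGS ($FFFFFFFF when there
  is no title), the third is the offset of the matching /#URLTBL record, and
  the fourth is not needed for this lookup. }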
function TChmReader.HasContextList: Boolean;
begin
Result := fContextList.Count > 0;


@@ -50,6 +50,7 @@ type
FCurrentStream: TStream; // used to buffer the files that are to be compressed
FCurrentIndex: Integer;
FOnGetFileData: TGetDataFunc;
FSearchTitlesOnly: Boolean;
FStringsStream: TMemoryStream; // the #STRINGS file
FTopicsStream: TMemoryStream; // the #TOPICS file
FURLTBLStream: TMemoryStream; // the #URLTBL file. has offsets of strings in URLSTR
@@ -130,6 +131,7 @@ type
property OutStream: TStream read FOutStream;
property Title: String read FTitle write FTitle;
property FullTextSearch: Boolean read FFullTextSearch write FFullTextSearch;
property SearchTitlesOnly: Boolean read FSearchTitlesOnly write FSearchTitlesOnly;
property DefaultFont: String read FDefaultFont write FDefaultFont;
property DefaultPage: String read FDefaultPage write FDefaultPage;
property TempRawStream: TStream read FTempStream write SetTempRawStream;
@@ -404,7 +406,6 @@ var
Entry: TFileEntryRec;
TmpStr: String;
TmpTitle: String;
TmpStream: TMemoryStream;
const
VersionStr = 'HHA Version 4.74.8702'; // does this matter?
begin
@@ -591,9 +592,7 @@ end;
procedure TChmWriter.WriteOBJINST;
var
Entry: TFileEntryRec;
i: Integer;
TmpPos: Integer;
ObjStream: TMemoryStream;
//Flags: Word;
begin
@@ -832,7 +831,7 @@ function TChmWriter.AddURL ( AURL: String; TopicsIndex: DWord ) : LongWord;
Len: LongWord;
begin
Rem := $4000 - (FURLSTRStream.Size mod $4000);
Len := 9 + Length(AString);
Len := 9 + Length(AString); // 2 dwords the string and NT
if Rem < Len then
while Rem > 0 do
begin
@@ -847,17 +846,18 @@ function TChmWriter.AddURL ( AURL: String; TopicsIndex: DWord ) : LongWord;
if FURLSTRStream.Size mod $4000 = 0 then
FURLSTRStream.WriteByte(0);
Result := FURLSTRStream.Position;
FURLSTRStream.WriteDWord(NToLE(DWord(0))); // URL Offset for topic??
FURLSTRStream.WriteDWord(NToLE(DWord(0))); // URL Offset for topic after the "Local" value
FURLSTRStream.WriteDWord(NToLE(DWord(0))); // Offset of FrameName??
FURLSTRStream.Write(AString[1], Length(AString));
FURLSTRStream.WriteByte(0); //NT
end;
begin
if AURL[1] = '/' then Delete(AURL,1,1);
if $1000 - (FURLTBLStream.Size mod $1000) = 4 then
FURLTBLStream.WriteDWord(NtoLE(DWord(4096)));
//if $1000 - (FURLTBLStream.Size mod $1000) = 4 then // we are at 4092
if FURLTBLStream.Size and $FFC = $FFC then // faster :)
FURLTBLStream.WriteDWord(0);
Result := FURLTBLStream.Position;
FURLTBLStream.WriteDWord($231e9f5c); //unknown
FURLTBLStream.WriteDWord(0);//($231e9f5c); //unknown
FURLTBLStream.WriteDWord(NtoLE(TopicsIndex)); // Index of topic in #TOPICS
FURLTBLStream.WriteDWord(NtoLE(AddURLString(AURL)));
end;
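{ A reading of the boundary check above, not original commentary: each /#URLTBL
  record is 12 bytes and the stream only grows in multiples of 4, so
  "Size and $FFC = $FFC" holds exactly when Size mod $1000 = $FFC, i.e. only 4
  bytes are left before the next 4 KB boundary; the padding DWord keeps a
  record from straddling that boundary. }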
@@ -1007,7 +1007,7 @@ type
begin
if Pos('.ht', AFileEntry.Name) > 0 then
begin
ATitle := FIndexedFiles.IndexFile(AStream, GetNewTopicsIndex);
ATitle := FIndexedFiles.IndexFile(AStream, GetNewTopicsIndex, FSearchTitlesOnly);
if ATitle <> '' then
TopicEntry.StringsOffset := AddString(ATitle)
else


@@ -68,6 +68,7 @@ Type
TIndexedWordList = class(TObject)
private
FIndexTitlesOnly: Boolean;
FIndexedFileCount: DWord;
//vars while processing page
FInTitle,
@@ -83,6 +84,7 @@ Type
FLongestWord: DWord;
FFirstWord: TIndexedWord;
FCachedWord: TIndexedWord;
FParser: THTMLParser;
function AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
function GetWordForward(AWord: String; StartWord: TIndexedWord; out WrongWord: TIndexedWord; AIsTitle: Boolean): TIndexedWord;
function GetWordBackward(AWord: String; StartWord: TIndexedWord; out WrongWord: TIndexedWord; AIsTitle: Boolean): TIndexedWord;
@@ -95,7 +97,7 @@ Type
public
constructor Create;
destructor Destroy; override;
function IndexFile(AStream: TStream; ATOPICIndex: Integer): String; // returns the documents <Title>
function IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String; // returns the documents <Title>
procedure Clear;
procedure AddWord(const AWord: TIndexedWord; StartingWord: TIndexedWord; AIsTitle: Boolean);
property FirstWord: TIndexedWord read FFirstWord;
@@ -231,7 +233,7 @@ begin
else if NoCaseTag = '<BODY>' then FInBody := True
else
end;
if FInBody and FIndexTitlesOnly then FParser.Done := True;
end;
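// (Reading of the change above: when only titles are indexed, parsing stops as
// soon as the <BODY> tag is reached; storing the parser in FParser is what
// allows this callback to set Done on it.)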
procedure TIndexedWordList.CBFountText(Text: string);
@@ -325,13 +327,13 @@ begin
inherited Destroy;
end;
function TIndexedWordList.IndexFile(AStream: TStream; ATOPICIndex: Integer): String;
function TIndexedWordList.IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String;
var
TheFile: String;
Parser: THTMLParser;
begin
FInBody := False;
FInTitle:= False;
FIndexTitlesOnly := AIndexOnlyTitles;
FWordCount := 0;
FTopicIndex := ATOPICIndex;
FIndexedFileCount := FIndexedFileCount +1;
@@ -341,11 +343,11 @@ begin
AStream.Read(TheFile[1], AStream.Size);
TheFile[Length(TheFile)] := #0;
Parser := THTMLParser.Create(@TheFile[1]);
Parser.OnFoundTag := @CBFoundTag;
Parser.OnFoundText := @CBFountText;
Parser.Exec;
Parser.Free;
FParser := THTMLParser.Create(@TheFile[1]);
FParser.OnFoundTag := @CBFoundTag;
FParser.OnFoundText := @CBFountText;
FParser.Exec;
FParser.Free;
Result := FDocTitle;
FDocTitle := '';