Removes UTF8LowerCaseMartin2, adds more comments to UTF8LowerCaseMartin, and removes the lookup-table usage from UTF8LowerCaseMartin

git-svn-id: trunk@32834 -
2025-08-07 01:26:31 +02:00 · 2011-10-12 06:21:49 +00:00 · 2011-10-12 06:21:49 +00:00 · 2371688c04
commit 2371688c04
parent 3208d981ee
2 changed files with 14 additions and 303 deletions
--- a/components/lazutils/lazutf8.pas
+++ b/components/lazutils/lazutf8.pas
@ -68,7 +68,6 @@ function UnicodeLowercase(u: cardinal): cardinal;
 function UTF8LowerCaseMattias(const s: utf8string): utf8string;
 function UTF8LowerCaseMartin(const AInStr: utf8string): utf8string;
 function UTF8LowerCaseMartin(const AInStr, ALocale: utf8string): utf8string;
-function UTF8LowerCaseMartin2(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
 {$endif}
 function UTF8LowerCase(const AInStr: utf8string): utf8string;
 function UTF8LowerCase(const AInStr, ALocale: utf8string): utf8string;
@ -1174,20 +1173,6 @@ begin
  Result := UTF8LowerCaseMartin(AInStr, '');
 end;

-var
-  UTF8LowerCaseMartinTable: array [char] of integer;
-
-procedure InitUTF8LowerCaseMartinTable;
-var
-  c: Char;
-begin
-  for c := #0 to #255 do begin
-    UTF8LowerCaseMartinTable[c] := 0;
-    if c in ['A'..'Z'] then UTF8LowerCaseMartinTable[c] := 1;
-    if c in [#$C3, #$C4, #$C5..#$C8, #$CE, #$D0] then UTF8LowerCaseMartinTable[c] := 2;
-  end;
-end;
-
 function UTF8LowerCaseMartin(const AInStr, ALocale: utf8string): utf8string;
 const
  ResultSizeIncr = 10;
@ -1212,10 +1197,17 @@ begin
  InStr := PChar(AInStr);
  InStrEnd := InStr + length(AInStr); // points behind last char

-  while (InStr < InStrEnd) and (UTF8LowerCaseMartinTable[InStr^] = 0)
-  do begin
+  // Fast initial scan over the leading bytes that lowercasing does not affect,
+  // so that UniqueString can be avoided if the resulting string will be identical
+  while (InStr < InStrEnd) do
+  begin
+    c := InStr^;
+    case c of
+    'A'..'Z',#$C3, #$C4, #$C5..#$C8, #$CE, #$D0: Break;
    // already lower, or otherwhise not affected
-    inc(InStr);
+    else
+      inc(InStr);
+    end;
  end;

  if InStr >= InStrEnd then
@ -1605,273 +1597,6 @@ begin
  end;
 end;

-function UTF8LowerCaseMartin2(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
-var
-  i, CounterDiff: PtrInt;
-  InStr, InStrEnd, OutStr: PChar;
-  // Language identification
-  IsTurkish: Boolean;
-  c: Char;
-
-begin
-  Result:=AInStr;
-  InStr := PChar(AInStr);
-  InStrEnd := InStr + length(AInStr); // points behind last char
-
-
-  // TODO: can be extended for some unicode chars too
-  // use a pre-initialized (global): array [char] of boolean;
-  while (InStr < InStrEnd) and (UTF8LowerCaseMartinTable[InStr^] = 0)
-  do begin
-    // already lower, or otherwhise not affected
-    inc(InStr);
-  end;
-
-  if InStr >= InStrEnd then
-    exit;
-
-  // Language identification
-  IsTurkish := ALocale = 'tu';
-
-  UniqueString(Result);
-  OutStr := PChar(Result) + (InStr - PChar(AInStr));
-  CounterDiff := 0;
-
-  while InStr < InStrEnd do
-  begin
-    c := InStr^;
-    case c of  // if NOT TABLE
-      'A'..'Z':
-        begin
-          { First ASCII chars }
-          // Special turkish handling
-          // capital undotted I to small undotted i
-          if IsTurkish and (c = 'I') then
-          begin
-            OutStr := PChar(OutStr - PChar(Result));
-            SetLength(Result,Length(Result)+1);// Increase the buffer
-            OutStr := PtrInt(OutStr) + PChar(Result);
-            OutStr^ := #$C4;
-            inc(OutStr);
-            OutStr^ := #$B1;
-            dec(CounterDiff);
-            inc(InStr);
-            inc(OutStr);
-          end
-          else
-          begin
-            OutStr^ := chr(ord(c)+32);
-            inc(InStr);
-            inc(OutStr);
-          end;
-        end;
-
-      #$C3:
-        begin
-          // $C39F: ß already lowercase
-          if InStr[1] in [#$80..#$9E] then begin
-            if (CounterDiff <> 0) then begin
-            OutStr^  :=InStr[0];
-            OutStr[1]  := chr(ord(InStr[1]) + $20);
-          end else begin
-              OutStr[1]  := chr(ord(InStr[1]) + $20);
-            end;
-          end else begin
-            if (CounterDiff <> 0) then begin
-            OutStr^  :=InStr[0];
-            OutStr[1]  :=InStr[1];
-          end;
-          end;
-          inc(InStr, 2);
-          inc(OutStr, 2);
-        end;
-      #$C4:
-        begin
-          c := InStr[1];
-          case c of
-            #$81..#$A9, #$B2..#$B6: //0
-              begin
-                if ord(c) mod 2 = 0 then begin
-                  if (CounterDiff <> 0) then begin
-                    OutStr^  := InStr[0];
-                    OutStr[1]  := chr(ord(c) + 1);
-                end else begin
-                    OutStr[1]  := chr(ord(c) + 1);
-                  end;
-                end else begin
-                  if (CounterDiff <> 0) then begin
-                    OutStr^  :=InStr[0];
-                    OutStr[1]  :=c;
-                end;
-              end;
-              end;
-            #$B8..#$FF: //1
-              begin
-                if ord(c) mod 2 = 1 then begin
-                  if (CounterDiff <> 0) then begin
-                    OutStr^  := InStr[0];
-                    OutStr[1]  := chr(ord(c) + 1);
-                end else begin
-                    OutStr[1]  := chr(ord(c) + 1);
-                  end;
-                end else begin
-                  if (CounterDiff <> 0) then begin
-                    OutStr^  :=InStr[0];
-                    OutStr[1]  :=c;
-                end;
-              end;
-              end;
-            #$B0:
-              begin
-                OutStr^ := 'i';
-                dec(OutStr);
-                inc(CounterDiff, 1);
-              end;
-          end;
-          inc(InStr, 2);
-          inc(OutStr, 2);
-        end;
-      #$C5:
-        begin
-          c := InStr[1];
-          case c of
-            #$8A..#$B7: //0
-              begin
-                if ord(c) mod 2 = 0 then begin
-                  if (CounterDiff <> 0) then begin
-                    OutStr^  := InStr[0];
-                    OutStr[1]  := chr(ord(c) + 1);
-                end else begin
-                    OutStr[1]  := chr(ord(c) + 1);
-                  end;
-                end else begin
-                  if (CounterDiff <> 0) then begin
-                    OutStr^  :=InStr[0];
-                    OutStr[1]  :=c;
-                end;
-              end;
-              end;
-            #$00..#$88, #$B9..#$FF: //1
-              begin
-                if ord(c) mod 2 = 1 then begin
-                  if (CounterDiff <> 0) then begin
-                    OutStr^  := InStr[0];
-                    OutStr[1]  := chr(ord(c) + 1);
-                end else begin
-                    OutStr[1]  := chr(ord(c) + 1);
-                  end;
-                end else begin
-                  if (CounterDiff <> 0) then begin
-                    OutStr^  :=InStr[0];
-                    OutStr[1]  :=c;
-                end;
-              end;
-              end;
-            #$B8:  // Ÿ
-            begin
-              OutStr[0] := #$C3;
-              OutStr[1] := #$BF;
-            end;
-          end;
-          inc(InStr, 2);
-          inc(OutStr, 2);
-        end;
-      #$C6..#$C7:
-        begin
-          c := InStr[1];
-          if ord(c) mod 2 = 1 then begin
-            if (CounterDiff <> 0) then begin
-              OutStr^  := InStr[0];
-              OutStr[1]  := chr(ord(c) + 1);
-          end else begin
-              OutStr[1]  := chr(ord(c) + 1);
-            end;
-          end else begin
-            if (CounterDiff <> 0) then begin
-              OutStr^  :=InStr[0];
-              OutStr[1]  :=c;
-          end;
-          end;
-          inc(InStr, 2);
-          inc(OutStr, 2);
-        end;
-      #$C8:
-        begin
-          c := InStr[1];
-          if (c in [#$00..#$B3]) and (ord(c) mod 2 = 1) then begin
-            if (CounterDiff <> 0) then begin
-            OutStr^  := InStr[0];
-            end;
-            OutStr[1]  := chr(ord(c) + 1);
-          end else begin
-            if (CounterDiff <> 0) then begin
-              OutStr^  :=InStr[0];
-              OutStr[1]  :=c;
-          end;
-          end;
-          inc(InStr, 2);
-          inc(OutStr, 2);
-        end;
-      #$CE:
-        begin
-          c := InStr[1];
-          case c of
-            #$91..#$9F:
-              begin
-                if (CounterDiff <> 0) then begin
-                OutStr^  := InStr[0];
-                end;
-                OutStr[1]  := chr(ord(c) + $20);
-              end;
-            #$A0..#$A9:
-              begin
-                OutStr^  := chr(ord(InStr[0])+1);
-                OutStr[1]  := chr(ord(c) - $10);
-              end;
-          end;
-          inc(InStr, 2);
-          inc(OutStr, 2);
-        end;
-      #$D0:
-        begin
-          c := InStr[1];
-          case c of
-            #$80..#$8F:
-              begin
-                OutStr^  := chr(ord(InStr[0])+1);
-                OutStr[1]  := chr(ord(c) + $10);
-              end;
-            #$90..#$9F:
-              begin
-                if (CounterDiff <> 0) then begin
-                OutStr^  := InStr[0];
-                end;
-                OutStr[1]  := chr(ord(c) + $20);
-              end;
-            #$A0..#$AF:
-              begin
-                OutStr^  := chr(ord(InStr[0])+1);
-                OutStr[1]  := chr(ord(c) - $10);
-              end;
-          end;
-          inc(InStr, 2);
-          inc(OutStr, 2);
-        end;
-      else
-        begin
-          // Copy the character if the string was disaligned by previous changes
-          if (CounterDiff <> 0) then
-          OutStr^:=c;
-          inc(InStr);
-          inc(OutStr);
-        end;
-    end; // Case InStr^
-  end; // while
-
-  // Final correction of the buffer size
-  SetLength(Result,OutStr - PChar(Result));
-end;
-
 function UTF8LowerCase(const AInStr: utf8string): utf8string;
 begin
  Result := UTF8LowerCase(AInStr, '');
@ -2182,7 +1907,6 @@ initialization
  InternalInit;
  {$ifdef LAZUTF8_USE_TABLES}
  InitUnicodeTables;
-  InitUTF8LowerCaseMartinTable
  {$endif}

 end.
--- a/test/lazutils/testunicode.pas
+++ b/test/lazutils/testunicode.pas
@ -44,7 +44,6 @@ procedure AssertStringOperationUTF8LowerCase(AMsg, ALocale, AStr1, AStrExpected2
 begin
  AssertStringOperation(AMsg, AStr1, UTF8LowerCase(AStr1, ALocale), AStrExpected2);
  AssertStringOperation('MARTIN:'+AMsg, AStr1, UTF8LowerCaseMartin(AStr1, ALocale), AStrExpected2);
-  AssertStringOperation('MARTIN2:'+AMsg, AStr1, UTF8LowerCaseMartin2(AStr1, ALocale), AStrExpected2);
 end;

 function DateTimeToMilliseconds(aDateTime: TDateTime): Int64;
@ -77,6 +76,9 @@ begin
  // Cyrillic
  AssertStringOperationUTF8UpperCase('Russian UTF8UpperCase 1', '', 'АБВЕЁЖЗКЛМНОПРДЙГ СУФХЦЧШЩЪЫЬЭЮЯИТ', 'АБВЕЁЖЗКЛМНОПРДЙГ СУФХЦЧШЩЪЫЬЭЮЯИТ');
  AssertStringOperationUTF8UpperCase('Russian UTF8UpperCase 2', '', 'абвеёжзклмнопрдйг суфхцчшщъыьэюяит', 'АБВЕЁЖЗКЛМНОПРДЙГ СУФХЦЧШЩЪЫЬЭЮЯИТ');
+  AssertStringOperationUTF8UpperCase('Cyrillic UTF8UpperCase 1', '', 'Ѡѡ Ѣѣ Ѥѥ Ѧѧ Ѩѩ Ѫѫ Ѭѭ Ѯѯ Ѱѱ Ѳѳ Ѵѵ Ѷѷ Ѹѹ Ѻѻ Ѽѽ Ѿѿ Ҁҁ ', 'ѡѡ ѣѣ ѥѥ ѧѧ ѩѩ ѫѫ ѭѭ ѯѯ ѱѱ ѳѳ ѵѵ ѷѷ ѹѹ ѻѻ ѽѽ ѿѿ ҁҁ');
+  AssertStringOperationUTF8UpperCase('Cyrillic UTF8UpperCase 2', '', 'Ҋҋ Ҍҍ Ҏҏ Ґґ Ғғ Ҕҕ Җҗ Ҙҙ Ққ Ҝҝ Ҟҟ Ҡҡ Ңң Ҥҥ Ҧҧ Ҩҩ Ҫҫ Ҭҭ Үү Ұұ Ҳҳ Ҵҵ Ҷҷ Ҹҹ Һһ Ҽҽ Ҿҿ', 'ҋҋ ҍҍ ҏҏ ґґ ғғ ҕҕ җҗ ҙҙ ққ ҝҝ ҟҟ ҡҡ ңң ҥҥ ҧҧ ҩҩ ҫҫ ҭҭ үү ұұ ҳҳ ҵҵ ҷҷ ҹҹ һһ ҽҽ ҿҿ');
+
  // What shouldnt change
  AssertStringOperationUTF8UpperCase('Chinese UTF8UpperCase 1', '', '名字叫嘉英，嘉陵江的嘉，英國的英', '名字叫嘉英，嘉陵江的嘉，英國的英');

@ -134,22 +136,6 @@ begin
    Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
  end;
  writeln;
-  Write('Martin LowerCase2 Performance test took:    ');
-  for j := 0 to 5 do begin
-    lStartTime := Now;
-    for i := 0 to TimerLoop do
-    begin
-      if j = 0 then Str := UTF8LowerCaseMartin2('abcdefghijklmnopqrstuwvxyz');
-      if j = 1 then Str := UTF8LowerCaseMartin2('ABCDEFGHIJKLMNOPQRSTUWVXYZ');
-      if j = 2 then Str := UTF8LowerCaseMartin2('aąbcćdeęfghijklłmnńoóprsśtuwyzźż');
-      if j = 3 then Str := UTF8LowerCaseMartin2('AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ');
-      if j = 4 then Str := UTF8LowerCaseMartin2('АБВЕЁЖЗКЛМНОПРДЙГ');
-      if j = 5 then Str := UTF8LowerCaseMartin2('名字叫嘉英，嘉陵江的嘉，英國的英');
-    end;
-    lTimeDiff := Now - lStartTime;
-    Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
-  end;
-  writeln;
  // Performance test
  Write('Mattias LowerCase- Performance test took:    ');
  for j := 0 to 5 do begin
@ -190,6 +176,7 @@ begin
  TestUTF8UpperCase();
  WriteLn('======= LowerCase =======');
  TestUTF8LowerCase();
+  WriteLn('Please press enter to continue');
  readln;
 end.