Large overhaul to simplify the lowercase1 code

git-svn-id: trunk@32861 -
2025-06-23 17:48:12 +02:00 · 2011-10-12 16:38:56 +00:00 · 2011-10-12 16:38:56 +00:00 · 7914ea9804
commit 7914ea9804
parent 443e4b5478
2 changed files with 175 additions and 270 deletions
--- a/components/lazutils/lazutf8.pas
+++ b/components/lazutils/lazutf8.pas
@ -69,8 +69,7 @@ function UTF8LowerCaseMattias(const s: utf8string): utf8string;
 {$endif}
 function UTF8LowerCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
 function UTF8LowerCase2(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
-function UTF8UpperCase(const AInStr: utf8string): utf8string;
+function UTF8UpperCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
 function UTF8UpperCase(const AInStr, ALocale: utf8string): utf8string;
 {function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
 //                                  StopOnNonASCII: Boolean = false): PtrInt;
 //function ValidUTF8String(const s: String): String;
@ -1461,7 +1460,8 @@ begin
      case c of  // if NOT TABLE
        #0:
        begin
-            if InStr >= InStrEnd then begin
+          if InStr >= InStrEnd then
          begin
            SetLength(Result,OutStr - PChar(Result));
            exit;
          end;
@ -1526,7 +1526,6 @@ begin
          end;
      end; // Case InStr^
    end;
  end;
 end;
@ -1537,14 +1536,17 @@ end;
  Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
  List of ranges which have lowercase:
-  $0041..$0061  ASCII
+
  codepoints    UTF-8 range           Description
  $0041..$0061  $41..$61              ASCII
  $00C0..$00DE: Result:=UnicodeLower00C0_00DE[u];
  $0100..$024E: Result:=UnicodeLower0100_024E[u];
  $0386..$03AB: Result:=UnicodeLower0386_03AB[u];
  $03D8..$042F: Result:=UnicodeLower03D8_042F[u];
  $0460..$0512: Result:=UnicodeLower0460_0512[u];
  $0531..$0556: Result:=u+48;
-  $10A0..$10C5  Georgian
+  $10A0..$10C5  E1 82 A0..E1 82 BF
                E1 83 80..E1 83 85    Georgian
  $1E00..$1FFC: Result:=UnicodeLower1E00_1FFC[u];
  $2126..$2183: Result:=UnicodeLower2126_2183[u];
  $24B6..$24CF: Result:=u+26;
@ -1558,7 +1560,7 @@ var
  InStr, InStrEnd, OutStr: PChar;
  // Language identification
  IsTurkish: Boolean;
-  c, c2: Char;
+  c1, c2, c3, new_c1: Char;
 begin
  Result:=AInStr;
  InStr := PChar(AInStr);
@ -1568,8 +1570,8 @@ begin
  // UniqueString if the resulting string will be identical
  while (InStr < InStrEnd) do
  begin
-    c := InStr^;
+    c1 := InStr^;
-    case c of
+    case c1 of
    'A'..'Z',#$C3, #$C4, #$C5..#$C8, #$CE, #$D0..#$D2, #$E1: Break;
    // already lower, or otherwhise not affected
    else
@ -1588,14 +1590,15 @@ begin
  while InStr < InStrEnd do
  begin
-    c := InStr^;
+    c1 := InStr^;
-    case c of  // if NOT TABLE
+    new_c1 := c1;
    case c1 of
      'A'..'Z':
      begin
        { First ASCII chars }
        // Special turkish handling
        // capital undotted I to small undotted i
-        if IsTurkish and (c = 'I') then
+        if IsTurkish and (c1 = 'I') then
        begin
          OutStr := PChar(OutStr - PChar(Result));
          SetLength(Result,Length(Result)+1);// Increase the buffer
@ -1604,34 +1607,27 @@ begin
          inc(OutStr);
          OutStr^ := #$B1;
          dec(CounterDiff);
          inc(InStr);
          inc(OutStr);
        end
        else
        begin
-          OutStr^ := chr(ord(c)+32);
+          OutStr^ := chr(ord(c1)+32);
        end;
        inc(InStr);
        inc(OutStr);
      end;
      end;
      // Chars with 2-bytes which might be modified
      #$C3, #$C4, #$C5..#$C8, #$CE, #$D0..#$D2:
      begin
        c2 := InStr[1];
        case c1 of
        // Latin Characters 0000–0FFF http://en.wikibooks.org/wiki/Unicode/Character_reference/0000-0FFF
        // $C380..$C39E: NewChar := OldChar + $20;
        // $C39F: ß already lowercase
        #$C3:
        begin
-        if InStr[1] in [#$80..#$9E] then
+          if c2 in [#$80..#$9E] then
-        begin
+            c2 := chr(ord(c2) + $20);
          if (CounterDiff <> 0) then OutStr^ := InStr[0];
          OutStr[1]  := chr(ord(InStr[1]) + $20);
        end
        else if (CounterDiff <> 0) then
        begin
          OutStr^  :=InStr[0];
          OutStr[1]  :=InStr[1];
        end;
        inc(InStr, 2);
        inc(OutStr, 2);
        end;
        // $C481..$C4A9: if OldChar mod 2 = 0 then NewChar := OldChar + 1;
        // Turkish capital dotted i to small dotted i
@ -1642,43 +1638,26 @@ begin
        // $C4B8..$C588: if OldChar mod 2 = 1 then NewChar := OldChar + 1;
        #$C4:
        begin
-        c := InStr[1];
+          case c2 of
        case c of
            #$81..#$A9, #$B2..#$B6: //0
            begin
-            if ord(c) mod 2 = 0 then
+              if ord(c2) mod 2 = 0 then
-            begin
+                c2 := chr(ord(c2) + 1);
              if (CounterDiff <> 0) then OutStr^  := InStr[0];
              OutStr[1]  := chr(ord(c) + 1);
            end
            else if (CounterDiff <> 0) then
            begin
              OutStr^  :=InStr[0];
              OutStr[1]  :=c;
            end;
            end;
            #$B8..#$FF: //1
            begin
-            if ord(c) mod 2 = 1 then
+              if ord(c2) mod 2 = 1 then
-            begin
+                c2 := chr(ord(c2) + 1);
              if (CounterDiff <> 0) then OutStr^  := InStr[0];
              OutStr[1]  := chr(ord(c) + 1);
            end
            else if (CounterDiff <> 0) then
            begin
              OutStr^  :=InStr[0];
              OutStr[1]  :=c;
            end;
            end;
            #$B0:
            begin
              OutStr^ := 'i';
            dec(OutStr);
            inc(CounterDiff, 1);
          end;
        end;
              inc(InStr, 2);
-        inc(OutStr, 2);
+              inc(OutStr);
              inc(CounterDiff, 1);
              Continue;
            end;
          end;
        end;
        // $C589 ŉ => ?
        // $C58A..$C5B7: if OldChar mod 2 = 0 then NewChar := OldChar + 1;
@ -1686,174 +1665,105 @@ begin
        // $C5B9..$C8B3: if OldChar mod 2 = 1 then NewChar := OldChar + 1;
        #$C5:
        begin
-        c := InStr[1];
+          case c2 of
        case c of
            #$8A..#$B7: //0
            begin
-            if ord(c) mod 2 = 0 then
+              if ord(c2) mod 2 = 0 then
-            begin
+                c2 := chr(ord(c2) + 1);
              if (CounterDiff <> 0) then OutStr^  := InStr[0];
              OutStr[1]  := chr(ord(c) + 1);
            end
            else if (CounterDiff <> 0) then
            begin
              OutStr^ := InStr[0];
              OutStr[1] :=c;
            end;
            end;
            #$00..#$88, #$B9..#$FF: //1
            begin
-            if ord(c) mod 2 = 1 then
+              if ord(c2) mod 2 = 1 then
-            begin
+                c2 := chr(ord(c2) + 1);
              if (CounterDiff <> 0) then OutStr^  := InStr[0];
              OutStr[1]  := chr(ord(c) + 1);
            end
            else if (CounterDiff <> 0) then
            begin
              OutStr^ := InStr[0];
              OutStr[1] :=c;
            end;
            end;
            #$B8:  // Ÿ
            begin
-            OutStr[0] := #$C3;
+              new_c1 := #$C3;
-            OutStr[1] := #$BF;
+              c2 := #$BF;
            end;
          end;
        inc(InStr, 2);
        inc(OutStr, 2);
        end;
        #$C6..#$C7:
        begin
-        c := InStr[1];
+          if ord(c2) mod 2 = 1 then
-        if ord(c) mod 2 = 1 then
+            c2 := chr(ord(c2) + 1);
        begin
          if (CounterDiff <> 0) then OutStr^  := InStr[0];
          OutStr[1]  := chr(ord(c) + 1);
        end
        else if (CounterDiff <> 0) then
        begin
          OutStr^ := InStr[0];
          OutStr[1] :=c;
        end;
        inc(InStr, 2);
        inc(OutStr, 2);
        end;
        #$C8:
        begin
-        c := InStr[1];
+          if (c2 in [#$00..#$B3]) and (ord(c2) mod 2 = 1) then
-        if (c in [#$00..#$B3]) and (ord(c) mod 2 = 1) then
+            c2 := chr(ord(c2) + 1);
        begin
          if (CounterDiff <> 0) then OutStr^  := InStr[0];
          OutStr[1] := chr(ord(c) + 1);
        end
        else if (CounterDiff <> 0) then
        begin
          OutStr^ := InStr[0];
          OutStr[1] :=c;
        end;
        inc(InStr, 2);
        inc(OutStr, 2);
        end;
        // $CE91..$CE9F: NewChar := OldChar + $20; // Greek Characters
        // $CEA0..$CEA9: NewChar := OldChar + $E0; // Greek Characters
        #$CE:
        begin
-        c := InStr[1];
+          case c2 of
        case c of
            #$91..#$9F:
            begin
-            if (CounterDiff <> 0) then OutStr^  := InStr[0];
+              c2 := chr(ord(c2) + $20);
            OutStr[1]  := chr(ord(c) + $20);
            end;
            #$A0..#$A9:
            begin
-            OutStr^ := chr(ord(InStr[0])+1);
+              c2 := chr(ord(c2) - $10);
            OutStr[1]  := chr(ord(c) - $10);
            end;
          end;
        inc(InStr, 2);
        inc(OutStr, 2);
        end;
        // $D080..$D08F: NewChar := OldChar + $110; // Cyrillic alphabet
        // $D090..$D09F: NewChar := OldChar + $20; // Cyrillic alphabet
        // $D0A0..$D0AF: NewChar := OldChar + $E0; // Cyrillic alphabet
        #$D0:
        begin
-        c := InStr[1];
+          c2 := InStr[1];
-        case c of
+          case c2 of
            #$80..#$8F:
            begin
-            OutStr^  := chr(ord(InStr[0])+1);
+              new_c1 := chr(ord(c1)+1);
-            OutStr[1]  := chr(ord(c) + $10);
+              c2  := chr(ord(c2) + $10);
            end;
            #$90..#$9F:
            begin
-            if (CounterDiff <> 0) then OutStr^  := InStr[0];
+              c2 := chr(ord(c2) + $20);
            OutStr[1]  := chr(ord(c) + $20);
            end;
            #$A0..#$AF:
            begin
-            OutStr^  := chr(ord(InStr[0])+1);
+              new_c1 := chr(ord(c1)+1);
-            OutStr[1]  := chr(ord(c) - $20);
+              c2 := chr(ord(c2) - $20);
            end;
          end;
        inc(InStr, 2);
        inc(OutStr, 2);
        end;
        // Archaic and non-slavic cyrillic 460-47F = D1A0-D1BF
        // These require just adding 1 to get the lowercase
        #$D1:
        begin
-        c := InStr[1];
+          if (c2 in [#$A0..#$BF]) and (ord(c2) mod 2 = 0) then
-        if (c in [#$A0..#$BF]) and (ord(c) mod 2 = 0) then
+            c2 := chr(ord(c2) + 1);
        begin
          if (CounterDiff <> 0) then OutStr^ := InStr[0];
          OutStr[1] := chr(ord(c) + 1);
        end
        else if (CounterDiff <> 0) then
        begin
          OutStr^ := InStr[0];
          OutStr[1] := c;
        end;
        inc(InStr, 2);
        inc(OutStr, 2);
        end;
        // Archaic and non-slavic cyrillic 480-4BF = D280-D2BF
        // These mostly require just adding 1 to get the lowercase
        #$D2:
        begin
-        c := InStr[1];
+          case c2 of
        case c of
            #$80:
            begin
-            if (CounterDiff <> 0) then OutStr^  := InStr[0];
+              c2 := chr(ord(c2) + 1);
            OutStr[1] := chr(ord(c) + 1);
            end;
            // #$81 is already lowercase
            // #$82-#$89 ???
          #$81..#$89:
          begin
            if (CounterDiff <> 0) then
            begin
              OutStr^ := InStr[0];
              OutStr[1] := c;
            end;
          end;
            #$8A..#$BF:
            begin
-            if ord(c) mod 2 = 0 then
+              if ord(c2) mod 2 = 0 then
-            begin
+                c2 := chr(ord(c2) + 1);
              if (CounterDiff <> 0) then OutStr^  := InStr[0];
              OutStr[1] := chr(ord(c) + 1);
            end
            else if (CounterDiff <> 0) then
            begin
              OutStr^ := InStr[0];
              OutStr[1] := c;
            end;
          end;
        end;
        end;
        // Common code 2-byte modifiable chars
        //if (CounterDiff <> 0) then
        //begin
        if (new_c1 <> c1) or (CounterDiff <> 0) then
          OutStr^ := new_c1;
        OutStr[1] := c2;
        //end;
        inc(InStr, 2);
        inc(OutStr, 2);
      end;
@ -1863,19 +1773,19 @@ begin
      // E1 83 80 - E1 83 85 => E2 B4 A0 - E2 B4 A5
      #$E1:
      begin
-        c := InStr[1];
+        c2 := InStr[1];
-        c2 := InStr[2];
+        c3 := InStr[2];
-        if (c = #$82) and (c2 in [#$A0..#$BF]) then
+        if (c2 = #$82) and (c3 in [#$A0..#$BF]) then
        begin
          OutStr^ := #$E2;
          OutStr[1] := #$B4;
-          OutStr[2] := chr(ord(c2) - $20);
+          OutStr[2] := chr(ord(c3) - $20);
        end
-        else if (c = #$83) and (c2 in [#$80..#$85]) then
+        else if (c2 = #$83) and (c3 in [#$80..#$85]) then
        begin
          OutStr^ := #$E2;
          OutStr[1] := #$B4;
-          OutStr[2] := chr(ord(c2) + $20);
+          OutStr[2] := chr(ord(c3) + $20);
        end
        else
        begin
@ -1891,7 +1801,7 @@ begin
      end;
    else
      // Copy the character if the string was disaligned by previous changes
-      if (CounterDiff <> 0) then OutStr^:=c;
+      if (CounterDiff <> 0) then OutStr^:= c1;
      inc(InStr);
      inc(OutStr);
    end; // Case InStr^
@ -1901,16 +1811,11 @@ begin
  SetLength(Result,OutStr - PChar(Result));
 end;
 function UTF8UpperCase(const AInStr: utf8string): utf8string;
 begin
  Result := UTF8UpperCase(AInStr, '');
 end;
 {
  AInStr - The input string
  ALocale - The locale. Use '' for maximum speed if one desires to ignore the locale
 }
-function UTF8UpperCase(const AInStr, ALocale: utf8string): utf8string;
+function UTF8UpperCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
 var
  i, InCounter, OutCounter: PtrInt;
  OutStr: PChar;
--- a/test/lazutils/testunicode.pas
+++ b/test/lazutils/testunicode.pas
@ -151,7 +151,7 @@ begin
  AssertStringOperationUTF8LowerCase('Offset Georgian UTF8LowerCase 2', 'tu', 'IႲⴒ Ⴓⴓ Ⴔⴔ Ⴕⴕ Ⴖⴖ Ⴗⴗ Ⴘⴘ Ⴙⴙ Ⴚⴚ Ⴛⴛ Ⴜⴜ Ⴝⴝ Ⴞⴞ Ⴟⴟ Ⴠⴠ Ⴡⴡ Ⴢⴢ Ⴣⴣ Ⴤⴤ Ⴥⴥ', 'ıⴒⴒ ⴓⴓ ⴔⴔ ⴕⴕ ⴖⴖ ⴗⴗ ⴘⴘ ⴙⴙ ⴚⴚ ⴛⴛ ⴜⴜ ⴝⴝ ⴞⴞ ⴟⴟ ⴠⴠ ⴡⴡ ⴢⴢ ⴣⴣ ⴤⴤ ⴥⴥ');
  // Performance test
-  Write('Mattias LowerCase- Performance test took:    ');
+{  Write('Mattias LowerCase- Performance test took:    ');
  for j := 0 to 9 do begin
    lStartTime := Now;
    for i := 0 to TimerLoop do
@ -170,7 +170,7 @@ begin
    lTimeDiff := Now - lStartTime;
    Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
  end;
-  writeln;
+  writeln;       }
  Write('       LowerCase-- Performance test took:    ');
  for j := 0 to 9 do begin
    lStartTime := Now;
@ -191,7 +191,7 @@ begin
    Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
  end;
  writeln;
-  Write('       LowerCase-- Performance test took:    ');
+  Write('       LowerCase2-- Performance test took:    ');
  for j := 0 to 9 do begin
    lStartTime := Now;
    for i := 0 to TimerLoop do
@ -211,7 +211,7 @@ begin
    Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
  end;
  writeln;
-  Write('       LowerCase-- Performance test took:    ');
+  Write('   Turk LowerCase-- Performance test took:    ');
  for j := 0 to 9 do begin
    lStartTime := Now;
    for i := 0 to TimerLoop do
@ -231,7 +231,7 @@ begin
    Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
  end;
  writeln;
-  Write('       LowerCase-- Performance test took:    ');
+  Write('   Turk LowerCase2-- Performance test took:    ');
  for j := 0 to 9 do begin
    lStartTime := Now;
    for i := 0 to TimerLoop do