+ "Utf8CodePointLen(pansichar,MaxLookAhead,IncludeCombiningDiacriticalMarks):

sizeint" function that returns: <0: invalid sequence detected after processing "-result" bytes 0: incomplete (may still be valid if MaxLookAhead is increased) >0: sequence of result bytes forms a codepoint (+ combining diacritical marks if that parameter was true) git-svn-id: trunk@30047 -
2025-04-09 19:08:15 +02:00 · 2015-03-01 17:12:17 +00:00 · 2015-03-01 17:12:17 +00:00 · ff020a3be4
commit ff020a3be4
parent caa77e1f8d
5 changed files with 304 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -12392,6 +12392,7 @@ tests/test/tunit3.pp svneol=native#text/plain
 tests/test/tunroll1.pp svneol=native#text/plain
 tests/test/tutf81.pp svneol=native#text/plain
 tests/test/tutf82.pp svneol=native#text/plain
+tests/test/tutf8cpl.pp svneol=native#text/plain
 tests/test/tvarpropsetter1.pp svneol=native#text/plain
 tests/test/tvarpropsetter2.pp svneol=native#text/plain
 tests/test/tvarset1.pp svneol=native#text/plain
--- a/rtl/inc/generic.inc
+++ b/rtl/inc/generic.inc
@ -1076,6 +1076,144 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}

 {$endif not cpujvm}

+
+function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
+  var
+    bytes: sizeint;
+    firstzerobit: byte;
+  begin
+    { see https://en.wikipedia.org/wiki/UTF-8#Description for details }
+
+    if maxlookahead<=0 then
+      begin
+        { incomplete }
+        result:=0;
+        exit;
+      end;
+    { inlcude the first byte }
+    result:=1;
+    { multiple byte utf-8 code point? }
+    if p[0]>#127 then
+      begin
+        { bsr searches for the leftmost 1 bit. We are interested in the
+          leftmost 0 bit, so first invert the value
+        }
+        firstzerobit:=bsrbyte(not(byte(p[0])));
+        { if there is no zero bit or the first zero bit is the rightmost bit
+          (bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
+          UTF-8-encoded string, and in the worst case bit 1 has to be zero)
+          Additionally, 5-byte UTF-8 sequences don't exist either, so bit 1
+          cannot be the first zero-bit either. And bits 6 and 7 can't be 0
+          either in the first byte.
+        }
+        if (firstzerobit<=1) or (firstzerobit>=6)  then
+          begin
+            result:=-result;
+            exit;
+          end;
+        { the number of bytes belonging to this code point is
+          7-(pos first 0-bit). Subtract 1 since we're already at the first
+          byte. All subsequent bytes of the same sequence must have their
+          highest bit set and the next one unset. We stop when we detect an
+          invalid sequence.
+        }
+        bytes:=6-firstzerobit;
+        while (result<maxlookahead) and
+              (bytes>0) and
+              ((ord(p[result]) and %11000000)=%10000000) do
+          begin
+            inc(result);
+            dec(bytes);
+          end;
+        { stopped because of invalid/incomplete sequence -> exit }
+        if bytes<>0 then
+          begin
+            if result>=maxlookahead then
+              result:=0
+            else
+              result:=-result;
+            exit;
+          end;
+      end;
+    if includecombiningdiacriticalmarks then
+      begin
+        { combining diacritical marks?
+            1) U+0300 - U+036F in UTF-8 = %11001100 10000000 - %11001101 10101111
+            2) U+1AB0 - U+1AFF in UTF-8 = %11100001 10101010 10110000 - %11100001 10101011 10111111
+            3) U+1DC0 - U+1DFF in UTF-8 = %11100001 10110111 10000000 - %11100001 10110111 10111111
+            4) U+20D0 - U+20FF in UTF-8 = %11100010 10000011 10010000 - %11100010 10000011 10111111
+            5) U+FE20 - U+FE2F in UTF-8 = %11101111 10111000 10100000 - %11101111 10111000 10101111
+        }
+        repeat
+          bytes:=result;
+          if result+1<maxlookahead then
+            begin
+              { case 1) }
+              if ((ord(p[result]) and %11001100=%11001100)) and
+                  (ord(p[result+1])>=%10000000) and
+                  (ord(p[result+1])<=%10101111) then
+                inc(result,2)
+                  { case 2), 3), 4), 5) }
+              else if (result+2<maxlookahead) and
+                 (ord(p[result])>=%11100001) then
+                begin
+                     { case 2) }
+                  if ((ord(p[result])=%11100001) and
+                      (ord(p[result+1])=%10101010) and
+                      (ord(p[result+2])>=%10110000) and
+                      (ord(p[result+2])<=%10111111)) or
+                     { case 3) }
+                     ((ord(p[result])=%11100001) and
+                      (ord(p[result+1])=%10110111) and
+                      (ord(p[result+2])>=%10000000) and
+                      (ord(p[result+2])<=%10111111)) or
+                     { case 4) }
+                     ((ord(p[result])=%11100010) and
+                      (ord(p[result+1])=%10000011) and
+                      (ord(p[result+2])>=%10010000) and
+                      (ord(p[result+2])<=%10111111)) or
+                     { case 5) }
+                     ((ord(p[result])=%11101111) and
+                      (ord(p[result+1])=%10111000) and
+                      (ord(p[result+2])>=%10100000) and
+                      (ord(p[result+2])<=%10101111)) then
+                    inc(result,3);
+                end;
+            end;
+        until bytes=result;
+        { is there an incomplete diacritical mark? (invalid makes little sense:
+          either a sequence is a combining diacritical mark, or it's not ; if
+          it's invalid, it may also not have been a combining diacritical mark)
+        }
+        if result<maxlookahead then
+          begin
+               { case 1) }
+            if (((ord(p[result]) and %11001100=%11001100)) and
+                (result+1>=maxlookahead)) or
+               { case 2) and 3)}
+               ((ord(p[result])=%11100001) and
+                ((result+1>=maxlookahead) or
+                 (((ord(p[result+1])=%10101010) or
+                   (ord(p[result+1])=%10110111)) and
+                  (result+2>=maxlookahead)))) or
+               { case 4 }
+               ((ord(p[result])=%11100010) and
+                ((result+1>=maxlookahead) or
+                 ((ord(p[result+1])=%10000011) and
+                  (result+2>=maxlookahead)))) or
+               { case 5 }
+               ((ord(p[result])=%11101111) and
+                ((result+1>=maxlookahead) or
+                 ((ord(p[result+1])=%10111000) and
+                  (result+2>=maxlookahead)))) then
+              begin
+                result:=0;
+                exit;
+              end;
+          end;
+      end;
+  end;
+
 {$ifndef FPC_SYSTEM_HAS_FPC_CHARARRAY_TO_SHORTSTR}

 procedure fpc_chararray_to_shortstr(out res : shortstring;const arr: array of char; zerobased: boolean = true);[public,alias:'FPC_CHARARRAY_TO_SHORTSTR']; compilerproc;
--- a/rtl/inc/systemh.inc
+++ b/rtl/inc/systemh.inc
@ -1081,6 +1081,14 @@ Function  Sseg:Word;{$ifdef SYSTEMINLINE}inline;{$endif}
 function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
 function strlen(p:pchar):sizeint;external name 'FPC_PCHAR_LENGTH';

+{ result:
+  <0: invalid sequence detected after processing "-result" bytes
+  0: incomplete (may still be valid if MaxLookAhead is increased)
+  >0: sequence of result bytes forms a codepoint (+ combining diacritics if that
+      parameter was true)
+}
+function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
+
 { Shortstring functions }
 Procedure Delete(var s:shortstring;index:SizeInt;count:SizeInt);
 Procedure Insert(const source:shortstring;var s:shortstring;index:SizeInt);
--- a/rtl/java/jsystemh.inc
+++ b/rtl/java/jsystemh.inc
@ -442,6 +442,15 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
 function strlen(p:pchar):sizeint;external name 'FPC_PCHAR_LENGTH';
 *)

+{ result:
+  <0: invalid sequence detected after processing "-result" bytes
+  0: incomplete (may still be valid if MaxLookAhead is increased)
+  >0: sequence of result bytes forms a codepoint (+ combining diacritics if that
+      parameter was true)
+}
+function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
+
+
 var
  { separated compared to generic version, for Java type safety }
  FPC_EMPTYANSICHAR : array[0..0] of ansichar;
--- a/tests/test/tutf8cpl.pp
+++ b/tests/test/tutf8cpl.pp
@ -0,0 +1,148 @@
+{$mode objfpc}
+{$codepage utf8}
+
+var
+  name: utf8string;
+  
+procedure check(index, lookahead: longint; combiningdiacritics: boolean; expectedresult: longint; checknr: longint);
+begin
+  if Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics)<>expectedresult then
+    begin
+      writeln('check ',checknr,': Utf8CodePointLen(',copy(name,index,length(name)),',',lookahead,',',combiningdiacritics,') = ',Utf8CodePointLen(pchar(@name[index]),lookahead,false),' <> expected ',expectedresult);
+      halt(1);
+    end;
+end;
+
+begin
+  name:='a';
+  check(1,0,false,0,1);
+  check(1,1,false,1,2);
+  check(1,1,true,1,3);
+  check(1,6,false,1,4);
+  check(1,6,true,1,5);
+  name:='ab';
+  check(1,6,false,1,6);
+  check(1,6,true,1,7);
+  check(1,1,false,1,8);
+  check(1,1,true,1,9);
+  check(2,6,false,1,10);
+  check(2,6,true,1,11);
+  name:='é';
+  check(1,1,false,0,12);
+  check(1,1,true,0,13);
+  check(1,2,false,2,14);
+  check(1,2,true,2,15);
+  check(2,1,false,-1,16);
+  check(2,1,true,-1,17);
+  check(2,3,false,-1,18);
+  check(2,3,true,-1,19);
+  name:='éa';
+  check(1,1,false,0,20);
+  check(1,1,true,0,21);
+  check(1,2,false,2,22);
+  check(1,2,true,2,23);
+  check(2,1,false,-1,24);
+  check(2,1,true,-1,25);
+  check(2,3,false,-1,26);
+  check(2,3,true,-1,27);
+  check(3,1,false,1,28);
+  check(3,1,true,1,29);
+  check(3,4,false,1,30);
+  check(3,4,true,1,31);
+  name[3]:=name[2];
+  check(1,1,false,0,32);
+  check(1,1,true,0,33);
+  check(1,2,false,2,34);
+  check(1,2,true,2,35);
+  check(2,1,false,-1,36);
+  check(2,1,true,-1,37);
+  check(2,3,false,-1,38);
+  check(2,3,true,-1,39);
+  check(3,1,false,-1,40);
+  check(3,1,true,-1,41);
+  check(3,4,false,-1,42);
+  check(3,4,true,-1,43);
+  { e + combining ` }
+  name:='e'#$0300'b';
+  { check just the e without accent }
+  check(1,1,false,1,44);
+  check(1,1,true,1,45);
+  check(1,2,false,1,46);
+  { partial diacritical mark }
+  check(1,2,true,0,47);
+  check(1,3,false,1,48);
+  { complete diacritical mark }
+  check(1,3,true,3,49);
+  check(1,4,false,1,50);
+  { complete diacritical mark (ignore extra character) }
+  check(1,4,true,3,51);
+  { start of combining diacritical mark -- treated as independent utf-8 codepoint }
+  check(2,1,false,0,52);
+  check(2,1,true,0,53);
+  check(2,3,false,2,54);
+  check(2,3,true,2,55);
+  { middle of the combining diacritical mark }
+  check(3,1,false,-1,56);
+  check(3,1,true,-1,57);
+  check(3,4,false,-1,58);
+  check(3,4,true,-1,59);
+  { corrupt diacritical mark = no diacritical mark }
+  name[3]:=name[4];
+  { partial diacritical mark (the corrupted byte is not included in the
+    lookahead) }
+  check(1,2,true,0,60);
+  check(1,3,false,1,61);
+  { ignore corrupt diacritical mark }
+  check(1,3,true,1,62);
+  check(1,4,false,1,63);
+  check(1,4,true,1,64);
+  { e + combining circle + combining superscript 'n' }
+  name:='e'#$20DD#$1DE0'b';
+  { partial diacritical mark }
+  check(1,2,true,0,65);
+  check(1,3,false,1,66);
+  check(1,3,true,0,67);
+  check(1,4,false,1,68);
+  { complete diacritical mark }
+  check(1,4,true,4,69);
+  check(1,4,false,1,70);
+  { partial second diacritical mark }
+  check(1,5,true,0,71);
+  check(1,5,false,1,72);
+  check(1,6,true,0,73);
+  check(1,6,false,1,74);
+  { complete both diacritical marks }
+  check(1,7,true,7,75);
+  check(1,7,false,1,76);
+  check(1,10,true,7,77);
+  check(1,10,false,1,78);
+  { complete both diacritical marks without first character }
+  check(2,6,true,6,79);
+  check(2,20,true,6,80);
+  { only the first one, treated as independent codepoint }
+  check(2,7,false,3,81);
+  { corrupt second diacritical mark }
+  name[7]:=name[8];
+  { partial second diacritical mark }
+  check(1,5,true,0,82);
+  check(1,5,false,1,83);
+  check(1,6,true,0,84);
+  check(1,6,false,1,85);
+  { including bad byte -> ignore second diacritical mark completely
+    (can't know it's part of a diacritical mark or something else) }
+  check(1,7,true,4,86);
+  check(1,7,false,1,87);
+  { complete both diacritical marks without first character,
+    but with corrupted byte }
+  check(2,7,true,3,88);
+  check(2,20,true,3,89);
+  { corrupted diacritical mark by itself }
+  { 1) incomplete }
+  check(5,1,false,0,90);
+  check(5,1,true,0,91);
+  check(5,2,false,0,92);
+  check(5,2,true,0,93);
+  { 2) invalid }
+  check(5,3,false,-2,94);
+  check(5,3,true,-2,95);
+end.