mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-09 19:08:15 +02:00
+ "Utf8CodePointLen(pansichar,MaxLookAhead,IncludeCombiningDiacriticalMarks):
sizeint" function that returns: <0: invalid sequence detected after processing "-result" bytes 0: incomplete (may still be valid if MaxLookAhead is increased) >0: sequence of result bytes forms a codepoint (+ combining diacritical marks if that parameter was true) git-svn-id: trunk@30047 -
This commit is contained in:
parent
caa77e1f8d
commit
ff020a3be4
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -12392,6 +12392,7 @@ tests/test/tunit3.pp svneol=native#text/plain
|
||||
tests/test/tunroll1.pp svneol=native#text/plain
|
||||
tests/test/tutf81.pp svneol=native#text/plain
|
||||
tests/test/tutf82.pp svneol=native#text/plain
|
||||
tests/test/tutf8cpl.pp svneol=native#text/plain
|
||||
tests/test/tvarpropsetter1.pp svneol=native#text/plain
|
||||
tests/test/tvarpropsetter2.pp svneol=native#text/plain
|
||||
tests/test/tvarset1.pp svneol=native#text/plain
|
||||
|
@ -1076,6 +1076,144 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
|
||||
{$endif not cpujvm}
|
||||
|
||||
|
||||
function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
|
||||
var
|
||||
bytes: sizeint;
|
||||
firstzerobit: byte;
|
||||
begin
|
||||
{ see https://en.wikipedia.org/wiki/UTF-8#Description for details }
|
||||
|
||||
if maxlookahead<=0 then
|
||||
begin
|
||||
{ incomplete }
|
||||
result:=0;
|
||||
exit;
|
||||
end;
|
||||
{ inlcude the first byte }
|
||||
result:=1;
|
||||
{ multiple byte utf-8 code point? }
|
||||
if p[0]>#127 then
|
||||
begin
|
||||
{ bsr searches for the leftmost 1 bit. We are interested in the
|
||||
leftmost 0 bit, so first invert the value
|
||||
}
|
||||
firstzerobit:=bsrbyte(not(byte(p[0])));
|
||||
{ if there is no zero bit or the first zero bit is the rightmost bit
|
||||
(bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
|
||||
UTF-8-encoded string, and in the worst case bit 1 has to be zero)
|
||||
Additionally, 5-byte UTF-8 sequences don't exist either, so bit 1
|
||||
cannot be the first zero-bit either. And bits 6 and 7 can't be 0
|
||||
either in the first byte.
|
||||
}
|
||||
if (firstzerobit<=1) or (firstzerobit>=6) then
|
||||
begin
|
||||
result:=-result;
|
||||
exit;
|
||||
end;
|
||||
{ the number of bytes belonging to this code point is
|
||||
7-(pos first 0-bit). Subtract 1 since we're already at the first
|
||||
byte. All subsequent bytes of the same sequence must have their
|
||||
highest bit set and the next one unset. We stop when we detect an
|
||||
invalid sequence.
|
||||
}
|
||||
bytes:=6-firstzerobit;
|
||||
while (result<maxlookahead) and
|
||||
(bytes>0) and
|
||||
((ord(p[result]) and %11000000)=%10000000) do
|
||||
begin
|
||||
inc(result);
|
||||
dec(bytes);
|
||||
end;
|
||||
{ stopped because of invalid/incomplete sequence -> exit }
|
||||
if bytes<>0 then
|
||||
begin
|
||||
if result>=maxlookahead then
|
||||
result:=0
|
||||
else
|
||||
result:=-result;
|
||||
exit;
|
||||
end;
|
||||
end;
|
||||
if includecombiningdiacriticalmarks then
|
||||
begin
|
||||
{ combining diacritical marks?
|
||||
1) U+0300 - U+036F in UTF-8 = %11001100 10000000 - %11001101 10101111
|
||||
2) U+1AB0 - U+1AFF in UTF-8 = %11100001 10101010 10110000 - %11100001 10101011 10111111
|
||||
3) U+1DC0 - U+1DFF in UTF-8 = %11100001 10110111 10000000 - %11100001 10110111 10111111
|
||||
4) U+20D0 - U+20FF in UTF-8 = %11100010 10000011 10010000 - %11100010 10000011 10111111
|
||||
5) U+FE20 - U+FE2F in UTF-8 = %11101111 10111000 10100000 - %11101111 10111000 10101111
|
||||
}
|
||||
repeat
|
||||
bytes:=result;
|
||||
if result+1<maxlookahead then
|
||||
begin
|
||||
{ case 1) }
|
||||
if ((ord(p[result]) and %11001100=%11001100)) and
|
||||
(ord(p[result+1])>=%10000000) and
|
||||
(ord(p[result+1])<=%10101111) then
|
||||
inc(result,2)
|
||||
{ case 2), 3), 4), 5) }
|
||||
else if (result+2<maxlookahead) and
|
||||
(ord(p[result])>=%11100001) then
|
||||
begin
|
||||
{ case 2) }
|
||||
if ((ord(p[result])=%11100001) and
|
||||
(ord(p[result+1])=%10101010) and
|
||||
(ord(p[result+2])>=%10110000) and
|
||||
(ord(p[result+2])<=%10111111)) or
|
||||
{ case 3) }
|
||||
((ord(p[result])=%11100001) and
|
||||
(ord(p[result+1])=%10110111) and
|
||||
(ord(p[result+2])>=%10000000) and
|
||||
(ord(p[result+2])<=%10111111)) or
|
||||
{ case 4) }
|
||||
((ord(p[result])=%11100010) and
|
||||
(ord(p[result+1])=%10000011) and
|
||||
(ord(p[result+2])>=%10010000) and
|
||||
(ord(p[result+2])<=%10111111)) or
|
||||
{ case 5) }
|
||||
((ord(p[result])=%11101111) and
|
||||
(ord(p[result+1])=%10111000) and
|
||||
(ord(p[result+2])>=%10100000) and
|
||||
(ord(p[result+2])<=%10101111)) then
|
||||
inc(result,3);
|
||||
end;
|
||||
end;
|
||||
until bytes=result;
|
||||
{ is there an incomplete diacritical mark? (invalid makes little sense:
|
||||
either a sequence is a combining diacritical mark, or it's not ; if
|
||||
it's invalid, it may also not have been a combining diacritical mark)
|
||||
}
|
||||
if result<maxlookahead then
|
||||
begin
|
||||
{ case 1) }
|
||||
if (((ord(p[result]) and %11001100=%11001100)) and
|
||||
(result+1>=maxlookahead)) or
|
||||
{ case 2) and 3)}
|
||||
((ord(p[result])=%11100001) and
|
||||
((result+1>=maxlookahead) or
|
||||
(((ord(p[result+1])=%10101010) or
|
||||
(ord(p[result+1])=%10110111)) and
|
||||
(result+2>=maxlookahead)))) or
|
||||
{ case 4 }
|
||||
((ord(p[result])=%11100010) and
|
||||
((result+1>=maxlookahead) or
|
||||
((ord(p[result+1])=%10000011) and
|
||||
(result+2>=maxlookahead)))) or
|
||||
{ case 5 }
|
||||
((ord(p[result])=%11101111) and
|
||||
((result+1>=maxlookahead) or
|
||||
((ord(p[result+1])=%10111000) and
|
||||
(result+2>=maxlookahead)))) then
|
||||
begin
|
||||
result:=0;
|
||||
exit;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
|
||||
{$ifndef FPC_SYSTEM_HAS_FPC_CHARARRAY_TO_SHORTSTR}
|
||||
|
||||
procedure fpc_chararray_to_shortstr(out res : shortstring;const arr: array of char; zerobased: boolean = true);[public,alias:'FPC_CHARARRAY_TO_SHORTSTR']; compilerproc;
|
||||
|
@ -1081,6 +1081,14 @@ Function Sseg:Word;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
function strlen(p:pchar):sizeint;external name 'FPC_PCHAR_LENGTH';
|
||||
|
||||
{ result:
|
||||
<0: invalid sequence detected after processing "-result" bytes
|
||||
0: incomplete (may still be valid if MaxLookAhead is increased)
|
||||
>0: sequence of result bytes forms a codepoint (+ combining diacritics if that
|
||||
parameter was true)
|
||||
}
|
||||
function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
|
||||
|
||||
{ Shortstring functions }
|
||||
Procedure Delete(var s:shortstring;index:SizeInt;count:SizeInt);
|
||||
Procedure Insert(const source:shortstring;var s:shortstring;index:SizeInt);
|
||||
|
@ -442,6 +442,15 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
function strlen(p:pchar):sizeint;external name 'FPC_PCHAR_LENGTH';
|
||||
*)
|
||||
|
||||
{ result:
|
||||
<0: invalid sequence detected after processing "-result" bytes
|
||||
0: incomplete (may still be valid if MaxLookAhead is increased)
|
||||
>0: sequence of result bytes forms a codepoint (+ combining diacritics if that
|
||||
parameter was true)
|
||||
}
|
||||
function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
|
||||
|
||||
|
||||
var
|
||||
{ separated compared to generic version, for Java type safety }
|
||||
FPC_EMPTYANSICHAR : array[0..0] of ansichar;
|
||||
|
148
tests/test/tutf8cpl.pp
Normal file
148
tests/test/tutf8cpl.pp
Normal file
@ -0,0 +1,148 @@
|
||||
{$mode objfpc}
|
||||
{$codepage utf8}
|
||||
|
||||
var
|
||||
name: utf8string;
|
||||
|
||||
procedure check(index, lookahead: longint; combiningdiacritics: boolean; expectedresult: longint; checknr: longint);
|
||||
begin
|
||||
if Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics)<>expectedresult then
|
||||
begin
|
||||
writeln('check ',checknr,': Utf8CodePointLen(',copy(name,index,length(name)),',',lookahead,',',combiningdiacritics,') = ',Utf8CodePointLen(pchar(@name[index]),lookahead,false),' <> expected ',expectedresult);
|
||||
halt(1);
|
||||
end;
|
||||
end;
|
||||
|
||||
begin
|
||||
name:='a';
|
||||
check(1,0,false,0,1);
|
||||
check(1,1,false,1,2);
|
||||
check(1,1,true,1,3);
|
||||
check(1,6,false,1,4);
|
||||
check(1,6,true,1,5);
|
||||
name:='ab';
|
||||
check(1,6,false,1,6);
|
||||
check(1,6,true,1,7);
|
||||
check(1,1,false,1,8);
|
||||
check(1,1,true,1,9);
|
||||
check(2,6,false,1,10);
|
||||
check(2,6,true,1,11);
|
||||
name:='é';
|
||||
check(1,1,false,0,12);
|
||||
check(1,1,true,0,13);
|
||||
check(1,2,false,2,14);
|
||||
check(1,2,true,2,15);
|
||||
check(2,1,false,-1,16);
|
||||
check(2,1,true,-1,17);
|
||||
check(2,3,false,-1,18);
|
||||
check(2,3,true,-1,19);
|
||||
name:='éa';
|
||||
check(1,1,false,0,20);
|
||||
check(1,1,true,0,21);
|
||||
check(1,2,false,2,22);
|
||||
check(1,2,true,2,23);
|
||||
check(2,1,false,-1,24);
|
||||
check(2,1,true,-1,25);
|
||||
check(2,3,false,-1,26);
|
||||
check(2,3,true,-1,27);
|
||||
check(3,1,false,1,28);
|
||||
check(3,1,true,1,29);
|
||||
check(3,4,false,1,30);
|
||||
check(3,4,true,1,31);
|
||||
name[3]:=name[2];
|
||||
check(1,1,false,0,32);
|
||||
check(1,1,true,0,33);
|
||||
check(1,2,false,2,34);
|
||||
check(1,2,true,2,35);
|
||||
check(2,1,false,-1,36);
|
||||
check(2,1,true,-1,37);
|
||||
check(2,3,false,-1,38);
|
||||
check(2,3,true,-1,39);
|
||||
check(3,1,false,-1,40);
|
||||
check(3,1,true,-1,41);
|
||||
check(3,4,false,-1,42);
|
||||
check(3,4,true,-1,43);
|
||||
{ e + combining ` }
|
||||
name:='e'#$0300'b';
|
||||
{ check just the e without accent }
|
||||
check(1,1,false,1,44);
|
||||
check(1,1,true,1,45);
|
||||
check(1,2,false,1,46);
|
||||
{ partial diacritical mark }
|
||||
check(1,2,true,0,47);
|
||||
check(1,3,false,1,48);
|
||||
{ complete diacritical mark }
|
||||
check(1,3,true,3,49);
|
||||
check(1,4,false,1,50);
|
||||
{ complete diacritical mark (ignore extra character) }
|
||||
check(1,4,true,3,51);
|
||||
{ start of combining diacritical mark -- treated as independent utf-8 codepoint }
|
||||
check(2,1,false,0,52);
|
||||
check(2,1,true,0,53);
|
||||
check(2,3,false,2,54);
|
||||
check(2,3,true,2,55);
|
||||
{ middle of the combining diacritical mark }
|
||||
check(3,1,false,-1,56);
|
||||
check(3,1,true,-1,57);
|
||||
check(3,4,false,-1,58);
|
||||
check(3,4,true,-1,59);
|
||||
{ corrupt diacritical mark = no diacritical mark }
|
||||
name[3]:=name[4];
|
||||
{ partial diacritical mark (the corrupted byte is not included in the
|
||||
lookahead) }
|
||||
check(1,2,true,0,60);
|
||||
check(1,3,false,1,61);
|
||||
{ ignore corrupt diacritical mark }
|
||||
check(1,3,true,1,62);
|
||||
check(1,4,false,1,63);
|
||||
check(1,4,true,1,64);
|
||||
{ e + combining circle + combining superscript 'n' }
|
||||
name:='e'#$20DD#$1DE0'b';
|
||||
{ partial diacritical mark }
|
||||
check(1,2,true,0,65);
|
||||
check(1,3,false,1,66);
|
||||
check(1,3,true,0,67);
|
||||
check(1,4,false,1,68);
|
||||
{ complete diacritical mark }
|
||||
check(1,4,true,4,69);
|
||||
check(1,4,false,1,70);
|
||||
{ partial second diacritical mark }
|
||||
check(1,5,true,0,71);
|
||||
check(1,5,false,1,72);
|
||||
check(1,6,true,0,73);
|
||||
check(1,6,false,1,74);
|
||||
{ complete both diacritical marks }
|
||||
check(1,7,true,7,75);
|
||||
check(1,7,false,1,76);
|
||||
check(1,10,true,7,77);
|
||||
check(1,10,false,1,78);
|
||||
{ complete both diacritical marks without first character }
|
||||
check(2,6,true,6,79);
|
||||
check(2,20,true,6,80);
|
||||
{ only the first one, treated as independent codepoint }
|
||||
check(2,7,false,3,81);
|
||||
{ corrupt second diacritical mark }
|
||||
name[7]:=name[8];
|
||||
{ partial second diacritical mark }
|
||||
check(1,5,true,0,82);
|
||||
check(1,5,false,1,83);
|
||||
check(1,6,true,0,84);
|
||||
check(1,6,false,1,85);
|
||||
{ including bad byte -> ignore second diacritical mark completely
|
||||
(can't know it's part of a diacritical mark or something else) }
|
||||
check(1,7,true,4,86);
|
||||
check(1,7,false,1,87);
|
||||
{ complete both diacritical marks without first character,
|
||||
but with corrupted byte }
|
||||
check(2,7,true,3,88);
|
||||
check(2,20,true,3,89);
|
||||
{ corrupted diacritical mark by itself }
|
||||
{ 1) incomplete }
|
||||
check(5,1,false,0,90);
|
||||
check(5,1,true,0,91);
|
||||
check(5,2,false,0,92);
|
||||
check(5,2,true,0,93);
|
||||
{ 2) invalid }
|
||||
check(5,3,false,-2,94);
|
||||
check(5,3,true,-2,95);
|
||||
end.
|
Loading…
Reference in New Issue
Block a user