+ "Utf8CodePointLen(pansichar,MaxLookAhead,IncludeCombiningDiacriticalMarks):

sizeint"  function that returns:
   <0: invalid sequence detected after processing "-result" bytes
   0:  incomplete (may still be valid if MaxLookAhead is increased)
   >0: sequence of result bytes forms a codepoint (+ combining diacritical
       marks if that parameter was true)

git-svn-id: trunk@30047 -
This commit is contained in:
Jonas Maebe 2015-03-01 17:12:17 +00:00
parent caa77e1f8d
commit ff020a3be4
5 changed files with 304 additions and 0 deletions

1
.gitattributes vendored
View File

@ -12392,6 +12392,7 @@ tests/test/tunit3.pp svneol=native#text/plain
tests/test/tunroll1.pp svneol=native#text/plain
tests/test/tutf81.pp svneol=native#text/plain
tests/test/tutf82.pp svneol=native#text/plain
tests/test/tutf8cpl.pp svneol=native#text/plain
tests/test/tvarpropsetter1.pp svneol=native#text/plain
tests/test/tvarpropsetter2.pp svneol=native#text/plain
tests/test/tvarset1.pp svneol=native#text/plain

View File

@ -1076,6 +1076,144 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
{$endif not cpujvm}
function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
var
bytes: sizeint;
firstzerobit: byte;
begin
{ see https://en.wikipedia.org/wiki/UTF-8#Description for details }
if maxlookahead<=0 then
begin
{ incomplete }
result:=0;
exit;
end;
{ inlcude the first byte }
result:=1;
{ multiple byte utf-8 code point? }
if p[0]>#127 then
begin
{ bsr searches for the leftmost 1 bit. We are interested in the
leftmost 0 bit, so first invert the value
}
firstzerobit:=bsrbyte(not(byte(p[0])));
{ if there is no zero bit or the first zero bit is the rightmost bit
(bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
UTF-8-encoded string, and in the worst case bit 1 has to be zero)
Additionally, 5-byte UTF-8 sequences don't exist either, so bit 1
cannot be the first zero-bit either. And bits 6 and 7 can't be 0
either in the first byte.
}
if (firstzerobit<=1) or (firstzerobit>=6) then
begin
result:=-result;
exit;
end;
{ the number of bytes belonging to this code point is
7-(pos first 0-bit). Subtract 1 since we're already at the first
byte. All subsequent bytes of the same sequence must have their
highest bit set and the next one unset. We stop when we detect an
invalid sequence.
}
bytes:=6-firstzerobit;
while (result<maxlookahead) and
(bytes>0) and
((ord(p[result]) and %11000000)=%10000000) do
begin
inc(result);
dec(bytes);
end;
{ stopped because of invalid/incomplete sequence -> exit }
if bytes<>0 then
begin
if result>=maxlookahead then
result:=0
else
result:=-result;
exit;
end;
end;
if includecombiningdiacriticalmarks then
begin
{ combining diacritical marks?
1) U+0300 - U+036F in UTF-8 = %11001100 10000000 - %11001101 10101111
2) U+1AB0 - U+1AFF in UTF-8 = %11100001 10101010 10110000 - %11100001 10101011 10111111
3) U+1DC0 - U+1DFF in UTF-8 = %11100001 10110111 10000000 - %11100001 10110111 10111111
4) U+20D0 - U+20FF in UTF-8 = %11100010 10000011 10010000 - %11100010 10000011 10111111
5) U+FE20 - U+FE2F in UTF-8 = %11101111 10111000 10100000 - %11101111 10111000 10101111
}
repeat
bytes:=result;
if result+1<maxlookahead then
begin
{ case 1) }
if ((ord(p[result]) and %11001100=%11001100)) and
(ord(p[result+1])>=%10000000) and
(ord(p[result+1])<=%10101111) then
inc(result,2)
{ case 2), 3), 4), 5) }
else if (result+2<maxlookahead) and
(ord(p[result])>=%11100001) then
begin
{ case 2) }
if ((ord(p[result])=%11100001) and
(ord(p[result+1])=%10101010) and
(ord(p[result+2])>=%10110000) and
(ord(p[result+2])<=%10111111)) or
{ case 3) }
((ord(p[result])=%11100001) and
(ord(p[result+1])=%10110111) and
(ord(p[result+2])>=%10000000) and
(ord(p[result+2])<=%10111111)) or
{ case 4) }
((ord(p[result])=%11100010) and
(ord(p[result+1])=%10000011) and
(ord(p[result+2])>=%10010000) and
(ord(p[result+2])<=%10111111)) or
{ case 5) }
((ord(p[result])=%11101111) and
(ord(p[result+1])=%10111000) and
(ord(p[result+2])>=%10100000) and
(ord(p[result+2])<=%10101111)) then
inc(result,3);
end;
end;
until bytes=result;
{ is there an incomplete diacritical mark? (invalid makes little sense:
either a sequence is a combining diacritical mark, or it's not ; if
it's invalid, it may also not have been a combining diacritical mark)
}
if result<maxlookahead then
begin
{ case 1) }
if (((ord(p[result]) and %11001100=%11001100)) and
(result+1>=maxlookahead)) or
{ case 2) and 3)}
((ord(p[result])=%11100001) and
((result+1>=maxlookahead) or
(((ord(p[result+1])=%10101010) or
(ord(p[result+1])=%10110111)) and
(result+2>=maxlookahead)))) or
{ case 4 }
((ord(p[result])=%11100010) and
((result+1>=maxlookahead) or
((ord(p[result+1])=%10000011) and
(result+2>=maxlookahead)))) or
{ case 5 }
((ord(p[result])=%11101111) and
((result+1>=maxlookahead) or
((ord(p[result+1])=%10111000) and
(result+2>=maxlookahead)))) then
begin
result:=0;
exit;
end;
end;
end;
end;
{$ifndef FPC_SYSTEM_HAS_FPC_CHARARRAY_TO_SHORTSTR}
procedure fpc_chararray_to_shortstr(out res : shortstring;const arr: array of char; zerobased: boolean = true);[public,alias:'FPC_CHARARRAY_TO_SHORTSTR']; compilerproc;

View File

@ -1081,6 +1081,14 @@ Function Sseg:Word;{$ifdef SYSTEMINLINE}inline;{$endif}
function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
function strlen(p:pchar):sizeint;external name 'FPC_PCHAR_LENGTH';
{ result:
<0: invalid sequence detected after processing "-result" bytes
0: incomplete (may still be valid if MaxLookAhead is increased)
>0: sequence of result bytes forms a codepoint (+ combining diacritics if that
parameter was true)
}
function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
{ Shortstring functions }
Procedure Delete(var s:shortstring;index:SizeInt;count:SizeInt);
Procedure Insert(const source:shortstring;var s:shortstring;index:SizeInt);

View File

@ -442,6 +442,15 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
function strlen(p:pchar):sizeint;external name 'FPC_PCHAR_LENGTH';
*)
{ result:
<0: invalid sequence detected after processing "-result" bytes
0: incomplete (may still be valid if MaxLookAhead is increased)
>0: sequence of result bytes forms a codepoint (+ combining diacritics if that
parameter was true)
}
function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
var
{ separated compared to generic version, for Java type safety }
FPC_EMPTYANSICHAR : array[0..0] of ansichar;

148
tests/test/tutf8cpl.pp Normal file
View File

@ -0,0 +1,148 @@
{$mode objfpc}
{$codepage utf8}
var
name: utf8string;
procedure check(index, lookahead: longint; combiningdiacritics: boolean; expectedresult: longint; checknr: longint);
begin
if Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics)<>expectedresult then
begin
writeln('check ',checknr,': Utf8CodePointLen(',copy(name,index,length(name)),',',lookahead,',',combiningdiacritics,') = ',Utf8CodePointLen(pchar(@name[index]),lookahead,false),' <> expected ',expectedresult);
halt(1);
end;
end;
begin
name:='a';
check(1,0,false,0,1);
check(1,1,false,1,2);
check(1,1,true,1,3);
check(1,6,false,1,4);
check(1,6,true,1,5);
name:='ab';
check(1,6,false,1,6);
check(1,6,true,1,7);
check(1,1,false,1,8);
check(1,1,true,1,9);
check(2,6,false,1,10);
check(2,6,true,1,11);
name:='é';
check(1,1,false,0,12);
check(1,1,true,0,13);
check(1,2,false,2,14);
check(1,2,true,2,15);
check(2,1,false,-1,16);
check(2,1,true,-1,17);
check(2,3,false,-1,18);
check(2,3,true,-1,19);
name:='éa';
check(1,1,false,0,20);
check(1,1,true,0,21);
check(1,2,false,2,22);
check(1,2,true,2,23);
check(2,1,false,-1,24);
check(2,1,true,-1,25);
check(2,3,false,-1,26);
check(2,3,true,-1,27);
check(3,1,false,1,28);
check(3,1,true,1,29);
check(3,4,false,1,30);
check(3,4,true,1,31);
name[3]:=name[2];
check(1,1,false,0,32);
check(1,1,true,0,33);
check(1,2,false,2,34);
check(1,2,true,2,35);
check(2,1,false,-1,36);
check(2,1,true,-1,37);
check(2,3,false,-1,38);
check(2,3,true,-1,39);
check(3,1,false,-1,40);
check(3,1,true,-1,41);
check(3,4,false,-1,42);
check(3,4,true,-1,43);
{ e + combining ` }
name:='e'#$0300'b';
{ check just the e without accent }
check(1,1,false,1,44);
check(1,1,true,1,45);
check(1,2,false,1,46);
{ partial diacritical mark }
check(1,2,true,0,47);
check(1,3,false,1,48);
{ complete diacritical mark }
check(1,3,true,3,49);
check(1,4,false,1,50);
{ complete diacritical mark (ignore extra character) }
check(1,4,true,3,51);
{ start of combining diacritical mark -- treated as independent utf-8 codepoint }
check(2,1,false,0,52);
check(2,1,true,0,53);
check(2,3,false,2,54);
check(2,3,true,2,55);
{ middle of the combining diacritical mark }
check(3,1,false,-1,56);
check(3,1,true,-1,57);
check(3,4,false,-1,58);
check(3,4,true,-1,59);
{ corrupt diacritical mark = no diacritical mark }
name[3]:=name[4];
{ partial diacritical mark (the corrupted byte is not included in the
lookahead) }
check(1,2,true,0,60);
check(1,3,false,1,61);
{ ignore corrupt diacritical mark }
check(1,3,true,1,62);
check(1,4,false,1,63);
check(1,4,true,1,64);
{ e + combining circle + combining superscript 'n' }
name:='e'#$20DD#$1DE0'b';
{ partial diacritical mark }
check(1,2,true,0,65);
check(1,3,false,1,66);
check(1,3,true,0,67);
check(1,4,false,1,68);
{ complete diacritical mark }
check(1,4,true,4,69);
check(1,4,false,1,70);
{ partial second diacritical mark }
check(1,5,true,0,71);
check(1,5,false,1,72);
check(1,6,true,0,73);
check(1,6,false,1,74);
{ complete both diacritical marks }
check(1,7,true,7,75);
check(1,7,false,1,76);
check(1,10,true,7,77);
check(1,10,false,1,78);
{ complete both diacritical marks without first character }
check(2,6,true,6,79);
check(2,20,true,6,80);
{ only the first one, treated as independent codepoint }
check(2,7,false,3,81);
{ corrupt second diacritical mark }
name[7]:=name[8];
{ partial second diacritical mark }
check(1,5,true,0,82);
check(1,5,false,1,83);
check(1,6,true,0,84);
check(1,6,false,1,85);
{ including bad byte -> ignore second diacritical mark completely
(can't know it's part of a diacritical mark or something else) }
check(1,7,true,4,86);
check(1,7,false,1,87);
{ complete both diacritical marks without first character,
but with corrupted byte }
check(2,7,true,3,88);
check(2,20,true,3,89);
{ corrupted diacritical mark by itself }
{ 1) incomplete }
check(5,1,false,0,90);
check(5,1,true,0,91);
check(5,2,false,0,92);
check(5,2,true,0,93);
{ 2) invalid }
check(5,3,false,-2,94);
check(5,3,true,-2,95);
end.