diff --git a/.gitattributes b/.gitattributes index 2bf50260cc..17536e105b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -12317,6 +12317,7 @@ tests/test/tunit3.pp svneol=native#text/plain tests/test/tunroll1.pp svneol=native#text/plain tests/test/tutf81.pp svneol=native#text/plain tests/test/tutf82.pp svneol=native#text/plain +tests/test/tutf8cpl.pp svneol=native#text/plain tests/test/tvarpropsetter1.pp svneol=native#text/plain tests/test/tvarpropsetter2.pp svneol=native#text/plain tests/test/tvarset1.pp svneol=native#text/plain diff --git a/rtl/inc/generic.inc b/rtl/inc/generic.inc index 4774d639e4..330082f038 100644 --- a/rtl/inc/generic.inc +++ b/rtl/inc/generic.inc @@ -1076,6 +1076,144 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif} {$endif not cpujvm} + +function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt; + var + bytes: sizeint; + firstzerobit: byte; + begin + { see https://en.wikipedia.org/wiki/UTF-8#Description for details } + + if maxlookahead<=0 then + begin + { incomplete } + result:=0; + exit; + end; + { inlcude the first byte } + result:=1; + { multiple byte utf-8 code point? } + if p[0]>#127 then + begin + { bsr searches for the leftmost 1 bit. We are interested in the + leftmost 0 bit, so first invert the value + } + firstzerobit:=bsrbyte(not(byte(p[0]))); + { if there is no zero bit or the first zero bit is the rightmost bit + (bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an + UTF-8-encoded string, and in the worst case bit 1 has to be zero) + Additionally, 5-byte UTF-8 sequences don't exist either, so bit 1 + cannot be the first zero-bit either. And bits 6 and 7 can't be 0 + either in the first byte. + } + if (firstzerobit<=1) or (firstzerobit>=6) then + begin + result:=-result; + exit; + end; + { the number of bytes belonging to this code point is + 7-(pos first 0-bit). Subtract 1 since we're already at the first + byte. All subsequent bytes of the same sequence must have their + highest bit set and the next one unset. We stop when we detect an + invalid sequence. + } + bytes:=6-firstzerobit; + while (result0) and + ((ord(p[result]) and %11000000)=%10000000) do + begin + inc(result); + dec(bytes); + end; + { stopped because of invalid/incomplete sequence -> exit } + if bytes<>0 then + begin + if result>=maxlookahead then + result:=0 + else + result:=-result; + exit; + end; + end; + if includecombiningdiacriticalmarks then + begin + { combining diacritical marks? + 1) U+0300 - U+036F in UTF-8 = %11001100 10000000 - %11001101 10101111 + 2) U+1AB0 - U+1AFF in UTF-8 = %11100001 10101010 10110000 - %11100001 10101011 10111111 + 3) U+1DC0 - U+1DFF in UTF-8 = %11100001 10110111 10000000 - %11100001 10110111 10111111 + 4) U+20D0 - U+20FF in UTF-8 = %11100010 10000011 10010000 - %11100010 10000011 10111111 + 5) U+FE20 - U+FE2F in UTF-8 = %11101111 10111000 10100000 - %11101111 10111000 10101111 + } + repeat + bytes:=result; + if result+1=%10000000) and + (ord(p[result+1])<=%10101111) then + inc(result,2) + { case 2), 3), 4), 5) } + else if (result+2=%11100001) then + begin + { case 2) } + if ((ord(p[result])=%11100001) and + (ord(p[result+1])=%10101010) and + (ord(p[result+2])>=%10110000) and + (ord(p[result+2])<=%10111111)) or + { case 3) } + ((ord(p[result])=%11100001) and + (ord(p[result+1])=%10110111) and + (ord(p[result+2])>=%10000000) and + (ord(p[result+2])<=%10111111)) or + { case 4) } + ((ord(p[result])=%11100010) and + (ord(p[result+1])=%10000011) and + (ord(p[result+2])>=%10010000) and + (ord(p[result+2])<=%10111111)) or + { case 5) } + ((ord(p[result])=%11101111) and + (ord(p[result+1])=%10111000) and + (ord(p[result+2])>=%10100000) and + (ord(p[result+2])<=%10101111)) then + inc(result,3); + end; + end; + until bytes=result; + { is there an incomplete diacritical mark? (invalid makes little sense: + either a sequence is a combining diacritical mark, or it's not ; if + it's invalid, it may also not have been a combining diacritical mark) + } + if result=maxlookahead)) or + { case 2) and 3)} + ((ord(p[result])=%11100001) and + ((result+1>=maxlookahead) or + (((ord(p[result+1])=%10101010) or + (ord(p[result+1])=%10110111)) and + (result+2>=maxlookahead)))) or + { case 4 } + ((ord(p[result])=%11100010) and + ((result+1>=maxlookahead) or + ((ord(p[result+1])=%10000011) and + (result+2>=maxlookahead)))) or + { case 5 } + ((ord(p[result])=%11101111) and + ((result+1>=maxlookahead) or + ((ord(p[result+1])=%10111000) and + (result+2>=maxlookahead)))) then + begin + result:=0; + exit; + end; + end; + end; + end; + {$ifndef FPC_SYSTEM_HAS_FPC_CHARARRAY_TO_SHORTSTR} procedure fpc_chararray_to_shortstr(out res : shortstring;const arr: array of char; zerobased: boolean = true);[public,alias:'FPC_CHARARRAY_TO_SHORTSTR']; compilerproc; diff --git a/rtl/inc/systemh.inc b/rtl/inc/systemh.inc index 69b489ad9f..72b3a858f6 100644 --- a/rtl/inc/systemh.inc +++ b/rtl/inc/systemh.inc @@ -1065,6 +1065,14 @@ Function Sseg:Word;{$ifdef SYSTEMINLINE}inline;{$endif} function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif} function strlen(p:pchar):sizeint;external name 'FPC_PCHAR_LENGTH'; +{ result: + <0: invalid sequence detected after processing "-result" bytes + 0: incomplete (may still be valid if MaxLookAhead is increased) + >0: sequence of result bytes forms a codepoint (+ combining diacritics if that + parameter was true) +} +function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt; + { Shortstring functions } Procedure Delete(var s:shortstring;index:SizeInt;count:SizeInt); Procedure Insert(const source:shortstring;var s:shortstring;index:SizeInt); diff --git a/rtl/java/jsystemh.inc b/rtl/java/jsystemh.inc index af0776f196..76047d74e1 100644 --- a/rtl/java/jsystemh.inc +++ b/rtl/java/jsystemh.inc @@ -442,6 +442,15 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif} function strlen(p:pchar):sizeint;external name 'FPC_PCHAR_LENGTH'; *) +{ result: + <0: invalid sequence detected after processing "-result" bytes + 0: incomplete (may still be valid if MaxLookAhead is increased) + >0: sequence of result bytes forms a codepoint (+ combining diacritics if that + parameter was true) +} +function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt; + + var { separated compared to generic version, for Java type safety } FPC_EMPTYANSICHAR : array[0..0] of ansichar; diff --git a/tests/test/tutf8cpl.pp b/tests/test/tutf8cpl.pp new file mode 100644 index 0000000000..6a6e7ee821 --- /dev/null +++ b/tests/test/tutf8cpl.pp @@ -0,0 +1,148 @@ +{$mode objfpc} +{$codepage utf8} + +var + name: utf8string; + +procedure check(index, lookahead: longint; combiningdiacritics: boolean; expectedresult: longint; checknr: longint); +begin + if Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics)<>expectedresult then + begin + writeln('check ',checknr,': Utf8CodePointLen(',copy(name,index,length(name)),',',lookahead,',',combiningdiacritics,') = ',Utf8CodePointLen(pchar(@name[index]),lookahead,false),' <> expected ',expectedresult); + halt(1); + end; +end; + +begin + name:='a'; + check(1,0,false,0,1); + check(1,1,false,1,2); + check(1,1,true,1,3); + check(1,6,false,1,4); + check(1,6,true,1,5); + name:='ab'; + check(1,6,false,1,6); + check(1,6,true,1,7); + check(1,1,false,1,8); + check(1,1,true,1,9); + check(2,6,false,1,10); + check(2,6,true,1,11); + name:='é'; + check(1,1,false,0,12); + check(1,1,true,0,13); + check(1,2,false,2,14); + check(1,2,true,2,15); + check(2,1,false,-1,16); + check(2,1,true,-1,17); + check(2,3,false,-1,18); + check(2,3,true,-1,19); + name:='éa'; + check(1,1,false,0,20); + check(1,1,true,0,21); + check(1,2,false,2,22); + check(1,2,true,2,23); + check(2,1,false,-1,24); + check(2,1,true,-1,25); + check(2,3,false,-1,26); + check(2,3,true,-1,27); + check(3,1,false,1,28); + check(3,1,true,1,29); + check(3,4,false,1,30); + check(3,4,true,1,31); + name[3]:=name[2]; + check(1,1,false,0,32); + check(1,1,true,0,33); + check(1,2,false,2,34); + check(1,2,true,2,35); + check(2,1,false,-1,36); + check(2,1,true,-1,37); + check(2,3,false,-1,38); + check(2,3,true,-1,39); + check(3,1,false,-1,40); + check(3,1,true,-1,41); + check(3,4,false,-1,42); + check(3,4,true,-1,43); + { e + combining ` } + name:='e'#$0300'b'; + { check just the e without accent } + check(1,1,false,1,44); + check(1,1,true,1,45); + check(1,2,false,1,46); + { partial diacritical mark } + check(1,2,true,0,47); + check(1,3,false,1,48); + { complete diacritical mark } + check(1,3,true,3,49); + check(1,4,false,1,50); + { complete diacritical mark (ignore extra character) } + check(1,4,true,3,51); + { start of combining diacritical mark -- treated as independent utf-8 codepoint } + check(2,1,false,0,52); + check(2,1,true,0,53); + check(2,3,false,2,54); + check(2,3,true,2,55); + { middle of the combining diacritical mark } + check(3,1,false,-1,56); + check(3,1,true,-1,57); + check(3,4,false,-1,58); + check(3,4,true,-1,59); + { corrupt diacritical mark = no diacritical mark } + name[3]:=name[4]; + { partial diacritical mark (the corrupted byte is not included in the + lookahead) } + check(1,2,true,0,60); + check(1,3,false,1,61); + { ignore corrupt diacritical mark } + check(1,3,true,1,62); + check(1,4,false,1,63); + check(1,4,true,1,64); + { e + combining circle + combining superscript 'n' } + name:='e'#$20DD#$1DE0'b'; + { partial diacritical mark } + check(1,2,true,0,65); + check(1,3,false,1,66); + check(1,3,true,0,67); + check(1,4,false,1,68); + { complete diacritical mark } + check(1,4,true,4,69); + check(1,4,false,1,70); + { partial second diacritical mark } + check(1,5,true,0,71); + check(1,5,false,1,72); + check(1,6,true,0,73); + check(1,6,false,1,74); + { complete both diacritical marks } + check(1,7,true,7,75); + check(1,7,false,1,76); + check(1,10,true,7,77); + check(1,10,false,1,78); + { complete both diacritical marks without first character } + check(2,6,true,6,79); + check(2,20,true,6,80); + { only the first one, treated as independent codepoint } + check(2,7,false,3,81); + { corrupt second diacritical mark } + name[7]:=name[8]; + { partial second diacritical mark } + check(1,5,true,0,82); + check(1,5,false,1,83); + check(1,6,true,0,84); + check(1,6,false,1,85); + { including bad byte -> ignore second diacritical mark completely + (can't know it's part of a diacritical mark or something else) } + check(1,7,true,4,86); + check(1,7,false,1,87); + { complete both diacritical marks without first character, + but with corrupted byte } + check(2,7,true,3,88); + check(2,20,true,3,89); + { corrupted diacritical mark by itself } + { 1) incomplete } + check(5,1,false,0,90); + check(5,1,true,0,91); + check(5,2,false,0,92); + check(5,2,true,0,93); + { 2) invalid } + check(5,3,false,-2,94); + check(5,3,true,-2,95); +end.