{$mode objfpc} {$codepage utf8} var name: utf8string; procedure check(index, lookahead: longint; combiningdiacritics: boolean; expectedresult: longint; checknr: longint); begin if Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics)<>expectedresult then begin writeln('check ',checknr,': Utf8CodePointLen(',copy(name,index,length(name)),',',lookahead,',',combiningdiacritics,') = ',Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics),' <> expected ',expectedresult); halt(1); end; end; begin name:='a'; check(1,0,false,0,1); check(1,1,false,1,2); check(1,1,true,1,3); check(1,6,false,1,4); check(1,6,true,1,5); name:='ab'; check(1,6,false,1,6); check(1,6,true,1,7); check(1,1,false,1,8); check(1,1,true,1,9); check(2,6,false,1,10); check(2,6,true,1,11); name:='é'; check(1,1,false,0,12); check(1,1,true,0,13); check(1,2,false,2,14); check(1,2,true,2,15); check(2,1,false,-1,16); check(2,1,true,-1,17); check(2,3,false,-1,18); check(2,3,true,-1,19); name:='éa'; check(1,1,false,0,20); check(1,1,true,0,21); check(1,2,false,2,22); check(1,2,true,2,23); check(2,1,false,-1,24); check(2,1,true,-1,25); check(2,3,false,-1,26); check(2,3,true,-1,27); check(3,1,false,1,28); check(3,1,true,1,29); check(3,4,false,1,30); check(3,4,true,1,31); name[3]:=name[2]; check(1,1,false,0,32); check(1,1,true,0,33); check(1,2,false,2,34); check(1,2,true,2,35); check(2,1,false,-1,36); check(2,1,true,-1,37); check(2,3,false,-1,38); check(2,3,true,-1,39); check(3,1,false,-1,40); check(3,1,true,-1,41); check(3,4,false,-1,42); check(3,4,true,-1,43); { e + combining ` } name:='e'#$0300'b'; { check just the e without accent } check(1,1,false,1,44); check(1,1,true,1,45); check(1,2,false,1,46); { partial diacritical mark } check(1,2,true,0,47); check(1,3,false,1,48); { complete diacritical mark } check(1,3,true,3,49); check(1,4,false,1,50); { complete diacritical mark (ignore extra character) } check(1,4,true,3,51); { start of combining diacritical mark -- treated as independent utf-8 codepoint } check(2,1,false,0,52); check(2,1,true,0,53); check(2,3,false,2,54); check(2,3,true,2,55); { middle of the combining diacritical mark } check(3,1,false,-1,56); check(3,1,true,-1,57); check(3,4,false,-1,58); check(3,4,true,-1,59); { corrupt diacritical mark = no diacritical mark } name[3]:=name[4]; { partial diacritical mark (the corrupted byte is not included in the lookahead) } check(1,2,true,0,60); check(1,3,false,1,61); { ignore corrupt diacritical mark } check(1,3,true,1,62); check(1,4,false,1,63); check(1,4,true,1,64); { e + combining circle + combining superscript 'n' } name:='e'#$20DD#$1DE0'b'; { partial diacritical mark } check(1,2,true,0,65); check(1,3,false,1,66); check(1,3,true,0,67); check(1,4,false,1,68); { complete diacritical mark } check(1,4,true,4,69); check(1,4,false,1,70); { partial second diacritical mark } check(1,5,true,0,71); check(1,5,false,1,72); check(1,6,true,0,73); check(1,6,false,1,74); { complete both diacritical marks } check(1,7,true,7,75); check(1,7,false,1,76); check(1,10,true,7,77); check(1,10,false,1,78); { complete both diacritical marks without first character } check(2,6,true,6,79); check(2,20,true,6,80); { only the first one, treated as independent codepoint } check(2,7,false,3,81); { corrupt second diacritical mark } name[7]:=name[8]; { partial second diacritical mark } check(1,5,true,0,82); check(1,5,false,1,83); check(1,6,true,0,84); check(1,6,false,1,85); { including bad byte -> ignore second diacritical mark completely (can't know it's part of a diacritical mark or something else) } check(1,7,true,4,86); check(1,7,false,1,87); { complete both diacritical marks without first character, but with corrupted byte } check(2,7,true,3,88); check(2,20,true,3,89); { corrupted diacritical mark by itself } { 1) incomplete } check(5,1,false,0,90); check(5,1,true,0,91); check(5,2,false,0,92); check(5,2,true,0,93); { 2) invalid } check(5,3,false,-3,94); check(5,3,true,-3,95); { Last allowed 4-byte codepoint, U+10FFFF. } name:=#$f4#$8f#$bf#$bf; check(1,4,false,4,96); { Last allowed 4-byte codepoint + 1, U+110000. } name:=#$f4#$90#$80#$80; check(1,4,false,-4,97); { First 5-byte codepoint, U+200000. } name:=#$f8#$88#$80#$80#$80; check(1,5,false,-1,98); { Overlong 2-byte U+7F. } name:=#$c1#$bf; check(1,2,false,-1,99); { Overlong 3-byte NULL. } name:=#$e0#$80#$80; check(1,3,false,-3,100); { Overlong 4-byte U+FFFF. } name:=#$f0#$8f#$bf#$bf; check(1,4,false,-4,101); { Cyrillic A + U+1AFF (last in the combining range 1AB0..1AFF). } name:='А᫿'; check(1,5,true,5,102); { Cyrillic A + U+1B00 (character just to the right of the combining range 1AB0..1AFF that happens to be combining anyway :D) } name:='Аᬀ'; check(1,5,true,5,103); { Cyrillic A + U+33F COMBINING DOUBLE OVERLINE. } name:='А̿'; check(1,4,true,4,104); { Cyrillic A + U+3099 (kana voice mark, 3-byte combining character outside of five ranges). } name:='А゙'; check(1,5,true,5,105); { Cyrillic A + U+1D167 (tremolo, 4-byte combining character outside of five ranges). } name:='А𝅧'; check(1,6,true,6,106); { Cyrillic A + U+E0100 (variation selector 17, 4-byte combining character outside of five ranges, special-cased in Utf8CodepointLen). } name:='А󠄀'; check(1,6,true,6,107); end.