fpc/tests/test/tutf8cpl.pp

197 lines
5.5 KiB
ObjectPascal
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{$mode objfpc}
{$codepage utf8}
var
name: utf8string;
procedure check(index, lookahead: longint; combiningdiacritics: boolean; expectedresult: longint; checknr: longint);
begin
if Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics)<>expectedresult then
begin
writeln('check ',checknr,': Utf8CodePointLen(',copy(name,index,length(name)),',',lookahead,',',combiningdiacritics,') = ',Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics),' <> expected ',expectedresult);
halt(1);
end;
end;
begin
name:='a';
check(1,0,false,0,1);
check(1,1,false,1,2);
check(1,1,true,1,3);
check(1,6,false,1,4);
check(1,6,true,1,5);
name:='ab';
check(1,6,false,1,6);
check(1,6,true,1,7);
check(1,1,false,1,8);
check(1,1,true,1,9);
check(2,6,false,1,10);
check(2,6,true,1,11);
name:='é';
check(1,1,false,0,12);
check(1,1,true,0,13);
check(1,2,false,2,14);
check(1,2,true,2,15);
check(2,1,false,-1,16);
check(2,1,true,-1,17);
check(2,3,false,-1,18);
check(2,3,true,-1,19);
name:='éa';
check(1,1,false,0,20);
check(1,1,true,0,21);
check(1,2,false,2,22);
check(1,2,true,2,23);
check(2,1,false,-1,24);
check(2,1,true,-1,25);
check(2,3,false,-1,26);
check(2,3,true,-1,27);
check(3,1,false,1,28);
check(3,1,true,1,29);
check(3,4,false,1,30);
check(3,4,true,1,31);
name[3]:=name[2];
check(1,1,false,0,32);
check(1,1,true,0,33);
check(1,2,false,2,34);
check(1,2,true,2,35);
check(2,1,false,-1,36);
check(2,1,true,-1,37);
check(2,3,false,-1,38);
check(2,3,true,-1,39);
check(3,1,false,-1,40);
check(3,1,true,-1,41);
check(3,4,false,-1,42);
check(3,4,true,-1,43);
{ e + combining ` }
name:='e'#$0300'b';
{ check just the e without accent }
check(1,1,false,1,44);
check(1,1,true,1,45);
check(1,2,false,1,46);
{ partial diacritical mark }
check(1,2,true,0,47);
check(1,3,false,1,48);
{ complete diacritical mark }
check(1,3,true,3,49);
check(1,4,false,1,50);
{ complete diacritical mark (ignore extra character) }
check(1,4,true,3,51);
{ start of combining diacritical mark -- treated as independent utf-8 codepoint }
check(2,1,false,0,52);
check(2,1,true,0,53);
check(2,3,false,2,54);
check(2,3,true,2,55);
{ middle of the combining diacritical mark }
check(3,1,false,-1,56);
check(3,1,true,-1,57);
check(3,4,false,-1,58);
check(3,4,true,-1,59);
{ corrupt diacritical mark = no diacritical mark }
name[3]:=name[4];
{ partial diacritical mark (the corrupted byte is not included in the
lookahead) }
check(1,2,true,0,60);
check(1,3,false,1,61);
{ ignore corrupt diacritical mark }
check(1,3,true,1,62);
check(1,4,false,1,63);
check(1,4,true,1,64);
{ e + combining circle + combining superscript 'n' }
name:='e'#$20DD#$1DE0'b';
{ partial diacritical mark }
check(1,2,true,0,65);
check(1,3,false,1,66);
check(1,3,true,0,67);
check(1,4,false,1,68);
{ complete diacritical mark }
check(1,4,true,4,69);
check(1,4,false,1,70);
{ partial second diacritical mark }
check(1,5,true,0,71);
check(1,5,false,1,72);
check(1,6,true,0,73);
check(1,6,false,1,74);
{ complete both diacritical marks }
check(1,7,true,7,75);
check(1,7,false,1,76);
check(1,10,true,7,77);
check(1,10,false,1,78);
{ complete both diacritical marks without first character }
check(2,6,true,6,79);
check(2,20,true,6,80);
{ only the first one, treated as independent codepoint }
check(2,7,false,3,81);
{ corrupt second diacritical mark }
name[7]:=name[8];
{ partial second diacritical mark }
check(1,5,true,0,82);
check(1,5,false,1,83);
check(1,6,true,0,84);
check(1,6,false,1,85);
{ including bad byte -> ignore second diacritical mark completely
(can't know it's part of a diacritical mark or something else) }
check(1,7,true,4,86);
check(1,7,false,1,87);
{ complete both diacritical marks without first character,
but with corrupted byte }
check(2,7,true,3,88);
check(2,20,true,3,89);
{ corrupted diacritical mark by itself }
{ 1) incomplete }
check(5,1,false,0,90);
check(5,1,true,0,91);
check(5,2,false,0,92);
check(5,2,true,0,93);
{ 2) invalid }
check(5,3,false,-3,94);
check(5,3,true,-3,95);
{ Last allowed 4-byte codepoint, U+10FFFF. }
name:=#$f4#$8f#$bf#$bf;
check(1,4,false,4,96);
{ Last allowed 4-byte codepoint + 1, U+110000. }
name:=#$f4#$90#$80#$80;
check(1,4,false,-4,97);
{ First 5-byte codepoint, U+200000. }
name:=#$f8#$88#$80#$80#$80;
check(1,5,false,-1,98);
{ Overlong 2-byte U+7F. }
name:=#$c1#$bf;
check(1,2,false,-1,99);
{ Overlong 3-byte NULL. }
name:=#$e0#$80#$80;
check(1,3,false,-3,100);
{ Overlong 4-byte U+FFFF. }
name:=#$f0#$8f#$bf#$bf;
check(1,4,false,-4,101);
{ Cyrillic A + U+1AFF (last in the combining range 1AB0..1AFF). }
name:='А᫿';
check(1,5,true,5,102);
{ Cyrillic A + U+1B00 (character just to the right of the combining range 1AB0..1AFF that happens to be combining anyway :D) }
name:='Аᬀ';
check(1,5,true,5,103);
{ Cyrillic A + U+33F COMBINING DOUBLE OVERLINE. }
name:='А̿';
check(1,4,true,4,104);
{ Cyrillic A + U+3099 (kana voice mark, 3-byte combining character outside of five ranges). }
name:='А゙';
check(1,5,true,5,105);
{ Cyrillic A + U+1D167 (tremolo, 4-byte combining character outside of five ranges). }
name:='А𝅧';
check(1,6,true,6,106);
{ Cyrillic A + U+E0100 (variation selector 17, 4-byte combining character outside of five ranges, special-cased in Utf8CodepointLen). }
name:='А󠄀';
check(1,6,true,6,107);
end.