fpc/tests/test/tutf8cpl.pp

{$mode objfpc}
{$codepage utf8}

var
  name: utf8string;

procedure check(index, lookahead: longint; combiningdiacritics: boolean; expectedresult: longint; checknr: longint);
begin
  if Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics)<>expectedresult then
    begin
      writeln('check ',checknr,': Utf8CodePointLen(',copy(name,index,length(name)),',',lookahead,',',combiningdiacritics,') = ',Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics),' <> expected ',expectedresult);
      halt(1);
    end;
end;

begin
  name:='a';
  check(1,0,false,0,1);
  check(1,1,false,1,2);
  check(1,1,true,1,3);
  check(1,6,false,1,4);
  check(1,6,true,1,5);
  name:='ab';
  check(1,6,false,1,6);
  check(1,6,true,1,7);
  check(1,1,false,1,8);
  check(1,1,true,1,9);
  check(2,6,false,1,10);
  check(2,6,true,1,11);
  name:='é';
  check(1,1,false,0,12);
  check(1,1,true,0,13);
  check(1,2,false,2,14);
  check(1,2,true,2,15);
  check(2,1,false,-1,16);
  check(2,1,true,-1,17);
  check(2,3,false,-1,18);
  check(2,3,true,-1,19);
  name:='éa';
  check(1,1,false,0,20);
  check(1,1,true,0,21);
  check(1,2,false,2,22);
  check(1,2,true,2,23);
  check(2,1,false,-1,24);
  check(2,1,true,-1,25);
  check(2,3,false,-1,26);
  check(2,3,true,-1,27);
  check(3,1,false,1,28);
  check(3,1,true,1,29);
  check(3,4,false,1,30);
  check(3,4,true,1,31);
  name[3]:=name[2];
  check(1,1,false,0,32);
  check(1,1,true,0,33);
  check(1,2,false,2,34);
  check(1,2,true,2,35);
  check(2,1,false,-1,36);
  check(2,1,true,-1,37);
  check(2,3,false,-1,38);
  check(2,3,true,-1,39);
  check(3,1,false,-1,40);
  check(3,1,true,-1,41);
  check(3,4,false,-1,42);
  check(3,4,true,-1,43);
  { e + combining ` }
  name:='e'#$0300'b';
  { check just the e without accent }
  check(1,1,false,1,44);
  check(1,1,true,1,45);
  check(1,2,false,1,46);
  { partial diacritical mark }
  check(1,2,true,0,47);
  check(1,3,false,1,48);
  { complete diacritical mark }
  check(1,3,true,3,49);
  check(1,4,false,1,50);
  { complete diacritical mark (ignore extra character) }
  check(1,4,true,3,51);
  { start of combining diacritical mark -- treated as independent utf-8 codepoint }
  check(2,1,false,0,52);
  check(2,1,true,0,53);
  check(2,3,false,2,54);
  check(2,3,true,2,55);
  { middle of the combining diacritical mark }
  check(3,1,false,-1,56);
  check(3,1,true,-1,57);
  check(3,4,false,-1,58);
  check(3,4,true,-1,59);
  { corrupt diacritical mark = no diacritical mark }
  name[3]:=name[4];
  { partial diacritical mark (the corrupted byte is not included in the
    lookahead) }
  check(1,2,true,0,60);
  check(1,3,false,1,61);
  { ignore corrupt diacritical mark }
  check(1,3,true,1,62);
  check(1,4,false,1,63);
  check(1,4,true,1,64);
  { e + combining circle + combining superscript 'n' }
  name:='e'#$20DD#$1DE0'b';
  { partial diacritical mark }
  check(1,2,true,0,65);
  check(1,3,false,1,66);
  check(1,3,true,0,67);
  check(1,4,false,1,68);
  { complete diacritical mark }
  check(1,4,true,4,69);
  check(1,4,false,1,70);
  { partial second diacritical mark }
  check(1,5,true,0,71);
  check(1,5,false,1,72);
  check(1,6,true,0,73);
  check(1,6,false,1,74);
  { complete both diacritical marks }
  check(1,7,true,7,75);
  check(1,7,false,1,76);
  check(1,10,true,7,77);
  check(1,10,false,1,78);
  { complete both diacritical marks without first character }
  check(2,6,true,6,79);
  check(2,20,true,6,80);
  { only the first one, treated as independent codepoint }
  check(2,7,false,3,81);
  { corrupt second diacritical mark }
  name[7]:=name[8];
  { partial second diacritical mark }
  check(1,5,true,0,82);
  check(1,5,false,1,83);
  check(1,6,true,0,84);
  check(1,6,false,1,85);
  { including bad byte -> ignore second diacritical mark completely
    (can't know it's part of a diacritical mark or something else) }
  check(1,7,true,4,86);
  check(1,7,false,1,87);
  { complete both diacritical marks without first character,
    but with corrupted byte }
  check(2,7,true,3,88);
  check(2,20,true,3,89);
  { corrupted diacritical mark by itself }
  { 1) incomplete }
  check(5,1,false,0,90);
  check(5,1,true,0,91);
  check(5,2,false,0,92);
  check(5,2,true,0,93);
  { 2) invalid }
  check(5,3,false,-3,94);
  check(5,3,true,-3,95);

  { Last allowed 4-byte codepoint, U+10FFFF. }
  name:=#$f4#$8f#$bf#$bf;
  check(1,4,false,4,96);

  { Last allowed 4-byte codepoint + 1, U+110000. }
  name:=#$f4#$90#$80#$80;
  check(1,4,false,-4,97);

  { First 5-byte codepoint, U+200000. }
  name:=#$f8#$88#$80#$80#$80;
  check(1,5,false,-1,98);

  { Overlong 2-byte U+7F. }
  name:=#$c1#$bf;
  check(1,2,false,-1,99);

  { Overlong 3-byte NULL. }
  name:=#$e0#$80#$80;
  check(1,3,false,-3,100);

  { Overlong 4-byte U+FFFF. }
  name:=#$f0#$8f#$bf#$bf;
  check(1,4,false,-4,101);

  { Cyrillic A + U+1AFF (last in the combining range 1AB0..1AFF). }
  name:='А᫿';
  check(1,5,true,5,102);

  { Cyrillic A + U+1B00 (character just to the right of the combining range 1AB0..1AFF that happens to be combining anyway :D) }
  name:='Аᬀ';
  check(1,5,true,5,103);

  { Cyrillic A + U+33F COMBINING DOUBLE OVERLINE. }
  name:='А̿';
  check(1,4,true,4,104);

  { Cyrillic A + U+3099 (kana voice mark, 3-byte combining character outside of five ranges). }
  name:='А゙';
  check(1,5,true,5,105);

  { Cyrillic A + U+1D167 (tremolo, 4-byte combining character outside of five ranges). }
  name:='А𝅧';
  check(1,6,true,6,106);

  { Cyrillic A + U+E0100 (variation selector 17, 4-byte combining character outside of five ranges, special-cased in Utf8CodepointLen). }
  name:='А󠄀';
  check(1,6,true,6,107);
end.