From 7c789e1ef4033003f035458ee52ec173db30a804 Mon Sep 17 00:00:00 2001 From: maxim Date: Sat, 19 Mar 2016 13:32:54 +0000 Subject: [PATCH] Merged revision(s) 51970 #ae57408e17, 51972-51973 #34be9ae2d0-#34be9ae2d0 from trunk: docs: add fugue icons to acknowledgements ........ LazUtf8: implement Utf8TryFindCodepointStart. ........ LazUtf8: Refactor UTF8FindNearestCharStart. Resolves Issue #0029851. ........ git-svn-id: branches/fixes_1_6@51995 - --- components/lazutils/lazutf8.pas | 112 ++++++++++++++++++++++---------- docs/acknowledgements.txt | 3 + 2 files changed, 82 insertions(+), 33 deletions(-) diff --git a/components/lazutils/lazutf8.pas b/components/lazutils/lazutf8.pas index b6d8f7a21e..6c42a8ee1b 100644 --- a/components/lazutils/lazutf8.pas +++ b/components/lazutils/lazutf8.pas @@ -78,6 +78,8 @@ function UTF8ToDoubleByteString(const s: string): string; function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt; function UTF8FindNearestCharStart(UTF8Str: PChar; Len: SizeInt; BytePos: SizeInt): SizeInt; +function Utf8TryFindCodepointStart(AString: PChar; var CurPos: PChar; out CharLen: Integer): Boolean; +function Utf8TryFindCodepointStart(const AString: String; var Index: Integer; out CharLen: Integer): Boolean; // find the n-th UTF8 character, ignoring BIDI function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar; // find the byte index of the n-th UTF8 character, ignoring BIDI (byte len of substr) @@ -579,46 +581,90 @@ begin end; end; -{ Find the start of the UTF8 character which contains BytePos, - Len is length in byte, BytePos starts at 0 } -function UTF8FindNearestCharStart(UTF8Str: PChar; Len: SizeInt; BytePos: SizeInt - ): SizeInt; + +{ Tries to find the start of a valid UTF8 codepoint that contains the character pointed to by CurPos + - AString: pointer to the (start of the) string + - CurPos: pointer to the character inside AString that we want to get the information of + * if the function succeeds, CurPos will point 
to the start of the valid UTF8 codepoint + * if the function fails, CurPos will not be changed + Note: if CurPos points beyond the end of AString you will get a crash! + - CharLen: the length of the UTF8 codepoint in bytes, if the function succeeds (0 otherwise) + - Returns: + True if the character pointed to by CurPos is part of a valid UTF8 codepoint (1 to 4 bytes), + otherwise it returns False. } +function Utf8TryFindCodepointStart(AString: PChar; var CurPos: PChar; out CharLen: Integer): Boolean; +var + SavedPos: PChar; begin - Result:=0; - if (UTF8Str<>nil) and (Len>0) and (BytePos>=0) then begin - Result:=BytePos; - if Result>Len then Result:=Len-1; - if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin - dec(Result); - if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin - dec(Result); - if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin - dec(Result); - // should be four byte character - if (ord(UTF8Str[Result]) and %11111000<>%11110000) then begin - // broken UTF8 character - inc(Result,3); - end else begin - // is four byte character - end; - end else if (ord(UTF8Str[Result]) and %11110000<>%11100000) then begin - // broken UTF8 character, should be three byte - inc(Result,2); - end else - begin - // is three byte character - end; - end else if (ord(UTF8Str[Result]) and %11100000<>%11000000) then begin - // broken UTF8 character, should be two byte - inc(Result); - end else + Result := False; + CharLen := 0; + if (not (Assigned(AString) and Assigned(CurPos))) + or (CurPos < AString) then Exit; + SavedPos := CurPos; + //Note: UTF8CharacterStrictLength will NOT "look" beyond the terminating #0 of a PChar, so this is safe with AnsiStrings + CharLen := UTF8CharacterStrictLength(CurPos); + if (CharLen > 0) then Exit(True); + if (CurPos > AString) then + begin + Dec(CurPos); //-1 + //is it second byte of 2..4 byte codepoint? 
+ CharLen := UTF8CharacterStrictLength(CurPos); + if (CharLen > 1) then Exit(True); + if (CurPos > AString) then + begin + Dec(CurPos); //-2 + //is it third byte of 3..4 byte codepoint? + CharLen := UTF8CharacterStrictLength(CurPos); + if (CharLen > 2) then Exit(True); + if (CurPos > AString) then begin - // is two byte character + Dec(CurPos); //-3 + //is it fourth byte of 4 byte codepoint? + CharLen := UTF8CharacterStrictLength(CurPos); + if (CharLen = 4) then Exit(True); end; end; end; + //At this point we failed: we are NOT inside a valid UTF8 codepoint! + CurPos := SavedPos; + CharLen := 0; end; +{ Same as above, but for a String and a 1-based byte Index instead of pointers + - Index: on success it is moved back to the first byte of the codepoint, otherwise it is not changed + - Returns False (and CharLen = 0) if Index is outside 1..Length(AString) or not inside a valid UTF8 codepoint } +function Utf8TryFindCodepointStart(const AString: String; var Index: Integer; out CharLen: Integer): Boolean; +var + CurPos, SavedCurPos: PChar; +begin + CharLen := 0; + if (Index < 1) or (Index > Length(AString)) then Exit(False); + CurPos := @AString[Index]; + SavedCurPos := CurPos; + Result := Utf8TryFindCodepointStart(PChar(AString), CurPos, CharLen); + Index := Index - (SavedCurPos - CurPos); +end; + +{ Find the start of the UTF8 character which contains BytePos, + if BytePos is not part of a valid Utf8Codepoint the function returns BytePos + Returns 0 if UTF8Str is nil, Len <= 0 or BytePos < 0 + Len is length in byte, BytePos starts at 0 } +function UTF8FindNearestCharStart(UTF8Str: PChar; Len: SizeInt; BytePos: SizeInt): SizeInt; +var + CurPos: PChar; + CharLen: Integer; +begin + //The old implementation returned 0 for nil/empty input or a negative BytePos; keep that instead of a negative offset + if (UTF8Str = nil) or (Len <= 0) or (BytePos < 0) then Exit(0); + if (BytePos > Len-1) then BytePos := Len - 1; + CurPos := Utf8Str + BytePos; + //No need to check the result value, since when it returns False CurPos will be reset + //to its original value, and that's what we want to return in that case + Utf8TryFindCodepointStart(Utf8Str, CurPos, CharLen); + Result := CurPos - Utf8Str; +end; + + { Len is the length in bytes of UTF8Str CharIndex is the position of the desired char (starting at 0), in chars } diff --git a/docs/acknowledgements.txt b/docs/acknowledgements.txt index e7b4286cca..9d5a5bca53 100644 --- a/docs/acknowledgements.txt +++ b/docs/acknowledgements.txt @@ -15,6 +15,9 @@ http://www.famfamfam.com/lab/icons/silk/ Tango Icon Library http://tango.freedesktop.org/Tango_Icon_Library 
+Fugue Icons +http://p.yusukekamiyamane.com/ + #RemObjects Pascal Script Created by Carlo Kok / RemObjects Software