mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-08-15 17:09:07 +02:00
* Added UseUnicodeWordDetection
git-svn-id: trunk@39564 -
This commit is contained in:
parent
08d574bf7d
commit
71bbab3512
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -7256,6 +7256,7 @@ packages/regexpr/Makefile.fpc svneol=native#text/plain
|
|||||||
packages/regexpr/Makefile.fpc.fpcmake svneol=native#text/plain
|
packages/regexpr/Makefile.fpc.fpcmake svneol=native#text/plain
|
||||||
packages/regexpr/examples/Makefile svneol=native#text/plain
|
packages/regexpr/examples/Makefile svneol=native#text/plain
|
||||||
packages/regexpr/examples/Makefile.fpc svneol=native#text/plain
|
packages/regexpr/examples/Makefile.fpc svneol=native#text/plain
|
||||||
|
packages/regexpr/examples/demowd.pp svneol=native#text/plain
|
||||||
packages/regexpr/examples/splitwords.lpi svneol=native#text/plain
|
packages/regexpr/examples/splitwords.lpi svneol=native#text/plain
|
||||||
packages/regexpr/examples/splitwords.pp svneol=native#text/plain
|
packages/regexpr/examples/splitwords.pp svneol=native#text/plain
|
||||||
packages/regexpr/examples/testreg1.pp svneol=native#text/plain
|
packages/regexpr/examples/testreg1.pp svneol=native#text/plain
|
||||||
|
27
packages/regexpr/examples/demowd.pp
Normal file
27
packages/regexpr/examples/demowd.pp
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
{
|
||||||
|
Program to demonstrate UseUnicodeWordDetection property.
|
||||||
|
Run this program as
|
||||||
|
testwd
|
||||||
|
testwd 1
|
||||||
|
to see the difference
|
||||||
|
}
|
||||||
|
{$mode objfpc}
|
||||||
|
{$h+}
|
||||||
|
uses cwstring,uregexpr;
|
||||||
|
|
||||||
|
Function ReplaceRegExpr(ARegExpr, AInputStr, AReplaceStr : Unicodestring) : string;
|
||||||
|
|
||||||
|
begin
|
||||||
|
with TRegExpr.Create do
|
||||||
|
try
|
||||||
|
UseUnicodeWordDetection:=ParamStr(1)='1';
|
||||||
|
Expression := ARegExpr;
|
||||||
|
Result:=Replace (AInputStr, AReplaceStr, True);
|
||||||
|
finally
|
||||||
|
Free;
|
||||||
|
end;
|
||||||
|
end;
|
||||||
|
|
||||||
|
begin
|
||||||
|
Writeln(ReplaceRegExpr('\w+', UTF8Decode('test слово ŕáćéí ϸϬϛ ュユョ'), '<$0>'));
|
||||||
|
end.
|
@ -49,6 +49,7 @@ unit RegExpr;
|
|||||||
interface
|
interface
|
||||||
|
|
||||||
{off $DEFINE DebugSynRegExpr}
|
{off $DEFINE DebugSynRegExpr}
|
||||||
|
{$DEFINE UnicodeWordDetection}
|
||||||
|
|
||||||
{$IFDEF FPC}
|
{$IFDEF FPC}
|
||||||
{$MODE DELPHI} // Delphi-compatible mode in FreePascal
|
{$MODE DELPHI} // Delphi-compatible mode in FreePascal
|
||||||
@ -101,6 +102,9 @@ interface
|
|||||||
{$DEFINE UseFirstCharSet} // Fast skip between matches for r.e. that starts with determined set of chars
|
{$DEFINE UseFirstCharSet} // Fast skip between matches for r.e. that starts with determined set of chars
|
||||||
{$ENDIF}
|
{$ENDIF}
|
||||||
{$DEFINE UseOsLineEndOnReplace} // On Replace if replace-with has "\n", use System.LineEnding (#10 #13 or #13#10); else use #10
|
{$DEFINE UseOsLineEndOnReplace} // On Replace if replace-with has "\n", use System.LineEnding (#10 #13 or #13#10); else use #10
|
||||||
|
{$IFNDEF UNICODE}
|
||||||
|
{$UNDEF UnicodeWordDetection}
|
||||||
|
{$ENDIF}
|
||||||
|
|
||||||
// ======== Define Pascal-language options
|
// ======== Define Pascal-language options
|
||||||
// Define 'UseAsserts' option (do not edit this definitions).
|
// Define 'UseAsserts' option (do not edit this definitions).
|
||||||
@ -292,7 +296,12 @@ type
|
|||||||
{$IFNDEF UniCode}
|
{$IFNDEF UniCode}
|
||||||
fLineSeparatorsSet : set of REChar;
|
fLineSeparatorsSet : set of REChar;
|
||||||
{$ENDIF}
|
{$ENDIF}
|
||||||
Function IsSpaceChar(AChar : PRegExprChar) : Boolean; inline;
|
{$IFDEF UnicodeWordDetection}
|
||||||
|
FUseUnicodeWordDetection : Boolean;
|
||||||
|
function IsUnicodeWordChar(AChar : REChar) : Boolean;
|
||||||
|
{$ENDIF}
|
||||||
|
function IsWordChar(AChar : REChar) : Boolean; inline;
|
||||||
|
function IsSpaceChar(AChar : PRegExprChar) : Boolean; inline;
|
||||||
|
|
||||||
// Mark programm as having to be [re]compiled
|
// Mark programm as having to be [re]compiled
|
||||||
procedure InvalidateProgramm;
|
procedure InvalidateProgramm;
|
||||||
@ -574,6 +583,10 @@ type
|
|||||||
// global constant)
|
// global constant)
|
||||||
property WordChars : RegExprString read fWordChars write fWordChars; //###0.929
|
property WordChars : RegExprString read fWordChars write fWordChars; //###0.929
|
||||||
|
|
||||||
|
{$IFDEF UnicodeWordDetection}
|
||||||
|
// If set to true, in addition to using WordChars, a heuristic to detect unicode word letters is used for \w
|
||||||
|
Property UseUnicodeWordDetection : Boolean Read FUseUnicodeWordDetection Write FUseUnicodeWordDetection;
|
||||||
|
{$ENDIF}
|
||||||
// line separators (like \n in Unix)
|
// line separators (like \n in Unix)
|
||||||
property LineSeparators : RegExprString read fLineSeparators write SetLineSeparators; //###0.941
|
property LineSeparators : RegExprString read fLineSeparators write SetLineSeparators; //###0.941
|
||||||
|
|
||||||
@ -661,6 +674,10 @@ function RegExprSubExpressions (const ARegExpr : string;
|
|||||||
implementation
|
implementation
|
||||||
|
|
||||||
{$IFDEF FPC}
|
{$IFDEF FPC}
|
||||||
|
{$IFDEF UnicodeWordDetection}
|
||||||
|
uses
|
||||||
|
UnicodeData;
|
||||||
|
{$ENDIF}
|
||||||
{$ELSE}
|
{$ELSE}
|
||||||
uses
|
uses
|
||||||
{$IFDEF SYN_WIN32}
|
{$IFDEF SYN_WIN32}
|
||||||
@ -1205,6 +1222,9 @@ constructor TRegExpr.Create;
|
|||||||
|
|
||||||
regexpbeg := nil;
|
regexpbeg := nil;
|
||||||
fExprIsCompiled := false;
|
fExprIsCompiled := false;
|
||||||
|
{$IFDEF UnicodeWordDetection}
|
||||||
|
FUseUnicodeWordDetection:=False;
|
||||||
|
{$ENDIF}
|
||||||
|
|
||||||
ModifierI := RegExprModifierI;
|
ModifierI := RegExprModifierI;
|
||||||
ModifierR := RegExprModifierR;
|
ModifierR := RegExprModifierR;
|
||||||
@ -1481,6 +1501,35 @@ procedure TRegExpr.SetModifier (AIndex : integer; ASet : boolean);
|
|||||||
{==================== Compiler section =======================}
|
{==================== Compiler section =======================}
|
||||||
{=============================================================}
|
{=============================================================}
|
||||||
|
|
||||||
|
{$IFDEF UnicodeWordDetection}
|
||||||
|
function TRegExpr.IsUnicodeWordChar(AChar: REChar): Boolean;
|
||||||
|
var
|
||||||
|
NType: byte;
|
||||||
|
begin
|
||||||
|
if Ord(AChar)<128 then
|
||||||
|
exit(false)
|
||||||
|
else
|
||||||
|
if Ord(AChar)>=LOW_SURROGATE_BEGIN then
|
||||||
|
exit(false)
|
||||||
|
else
|
||||||
|
begin
|
||||||
|
NType:= GetProps(Ord(AChar))^.Category;
|
||||||
|
Result:= (NType<=UGC_OtherNumber);
|
||||||
|
end;
|
||||||
|
end;
|
||||||
|
{$ENDIF}
|
||||||
|
|
||||||
|
|
||||||
|
function TRegExpr.IsWordChar(AChar: REChar): Boolean; inline;
|
||||||
|
begin
|
||||||
|
Result := Pos(AChar, fWordChars)>0;
|
||||||
|
{$IFDEF UnicodeWordDetection}
|
||||||
|
If Not Result and UseUnicodeWordDetection then
|
||||||
|
Result:=IsUnicodeWordChar(aChar);
|
||||||
|
{$ENDIF}
|
||||||
|
end;
|
||||||
|
|
||||||
|
|
||||||
function TRegExpr.IsSpaceChar(AChar: PRegExprChar): Boolean;
|
function TRegExpr.IsSpaceChar(AChar: PRegExprChar): Boolean;
|
||||||
begin
|
begin
|
||||||
Result:=Pos(AChar^,fSpaceChars)>0;
|
Result:=Pos(AChar^,fSpaceChars)>0;
|
||||||
@ -2793,7 +2842,7 @@ function TRegExpr.regrepeat (p : PRegExprChar; AMax : PtrInt) : PtrInt;
|
|||||||
{$IFNDEF UseSetOfChar} //###0.929
|
{$IFNDEF UseSetOfChar} //###0.929
|
||||||
ANYLETTER:
|
ANYLETTER:
|
||||||
while (Result < TheMax) and
|
while (Result < TheMax) and
|
||||||
(Pos (scan^, fWordChars) > 0) //###0.940
|
IsWordChar(scan^) //###0.940
|
||||||
{ ((scan^ >= 'a') and (scan^ <= 'z') !! I've forgotten (>='0') and (<='9')
|
{ ((scan^ >= 'a') and (scan^ <= 'z') !! I've forgotten (>='0') and (<='9')
|
||||||
or (scan^ >= 'A') and (scan^ <= 'Z') or (scan^ = '_'))} do begin
|
or (scan^ >= 'A') and (scan^ <= 'Z') or (scan^ = '_'))} do begin
|
||||||
inc (Result);
|
inc (Result);
|
||||||
@ -2801,7 +2850,7 @@ function TRegExpr.regrepeat (p : PRegExprChar; AMax : PtrInt) : PtrInt;
|
|||||||
end;
|
end;
|
||||||
NOTLETTER:
|
NOTLETTER:
|
||||||
while (Result < TheMax) and
|
while (Result < TheMax) and
|
||||||
(Pos (scan^, fWordChars) <= 0) //###0.940
|
not IsWordChar(scan^) //###0.940
|
||||||
{ not ((scan^ >= 'a') and (scan^ <= 'z') !! I've forgotten (>='0') and (<='9')
|
{ not ((scan^ >= 'a') and (scan^ <= 'z') !! I've forgotten (>='0') and (<='9')
|
||||||
or (scan^ >= 'A') and (scan^ <= 'Z')
|
or (scan^ >= 'A') and (scan^ <= 'Z')
|
||||||
or (scan^ = '_'))} do begin
|
or (scan^ = '_'))} do begin
|
||||||
@ -2933,11 +2982,11 @@ function TRegExpr.MatchPrim (prog : PRegExprChar) : boolean;
|
|||||||
BOUND:
|
BOUND:
|
||||||
if (scan^ = BOUND)
|
if (scan^ = BOUND)
|
||||||
xor (
|
xor (
|
||||||
((reginput = fInputStart) or (Pos ((reginput - 1)^, fWordChars) <= 0))
|
((reginput = fInputStart) or not IsWordChar((reginput - 1)^))
|
||||||
and (reginput^ <> #0) and (Pos (reginput^, fWordChars) > 0)
|
and (reginput^ <> #0) and IsWordChar(reginput^)
|
||||||
or
|
or
|
||||||
(reginput <> fInputStart) and (Pos ((reginput - 1)^, fWordChars) > 0)
|
(reginput <> fInputStart) and IsWordChar((reginput - 1)^)
|
||||||
and ((reginput^ = #0) or (Pos (reginput^, fWordChars) <= 0)))
|
and ((reginput^ = #0) or not IsWordChar(reginput^)))
|
||||||
then EXIT;
|
then EXIT;
|
||||||
|
|
||||||
BOL: if reginput <> fInputStart
|
BOL: if reginput <> fInputStart
|
||||||
@ -3006,12 +3055,12 @@ function TRegExpr.MatchPrim (prog : PRegExprChar) : boolean;
|
|||||||
end;
|
end;
|
||||||
{$IFNDEF UseSetOfChar} //###0.929
|
{$IFNDEF UseSetOfChar} //###0.929
|
||||||
ANYLETTER: begin
|
ANYLETTER: begin
|
||||||
if (reginput^ = #0) or (Pos (reginput^, fWordChars) <= 0) //###0.943
|
if (reginput^ = #0) or not IsWordChar(reginput^) //###0.943
|
||||||
then EXIT;
|
then EXIT;
|
||||||
inc (reginput);
|
inc (reginput);
|
||||||
end;
|
end;
|
||||||
NOTLETTER: begin
|
NOTLETTER: begin
|
||||||
if (reginput^ = #0) or (Pos (reginput^, fWordChars) > 0) //###0.943
|
if (reginput^ = #0) or IsWordChar(reginput^) //###0.943
|
||||||
then EXIT;
|
then EXIT;
|
||||||
inc (reginput);
|
inc (reginput);
|
||||||
end;
|
end;
|
||||||
|
Loading…
Reference in New Issue
Block a user