From 0c22ee86781c2aae8457bddae8b88fbfc38b6cfb Mon Sep 17 00:00:00 2001 From: nickysn Date: Sun, 21 Feb 2021 17:21:13 +0000 Subject: [PATCH] + added an eawparser tool - tool for parsing the EastAsianWidth.txt unicode property table and converting it to code git-svn-id: trunk@48754 - --- .gitattributes | 2 + utils/unicode/eawparser.lpi | 58 +++++++ utils/unicode/eawparser.lpr | 332 ++++++++++++++++++++++++++++++++++++ utils/unicode/fpmake.pp | 1 + 4 files changed, 393 insertions(+) create mode 100644 utils/unicode/eawparser.lpi create mode 100644 utils/unicode/eawparser.lpr diff --git a/.gitattributes b/.gitattributes index 1a4db538d5..4669e48c87 100644 --- a/.gitattributes +++ b/.gitattributes @@ -19774,6 +19774,8 @@ utils/unicode/cldrtest.pas svneol=native#text/pascal utils/unicode/cldrtxt.pas svneol=native#text/plain utils/unicode/cldrxml.pas svneol=native#text/pascal utils/unicode/data/readme.txt svneol=native#text/plain +utils/unicode/eawparser.lpi svneol=native#text/plain +utils/unicode/eawparser.lpr svneol=native#text/pascal utils/unicode/fpmake.pp svneol=native#text/plain utils/unicode/gbpparser.lpi svneol=native#text/plain utils/unicode/gbpparser.lpr svneol=native#text/pascal diff --git a/utils/unicode/eawparser.lpi b/utils/unicode/eawparser.lpi new file mode 100644 index 0000000000..f7b6a46841 --- /dev/null +++ b/utils/unicode/eawparser.lpi @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + <UseAppBundle Value="False"/> + <ResourceType Value="res"/> + </General> + <BuildModes Count="1"> + <Item1 Name="Default" Default="True"/> + </BuildModes> + <PublishOptions> + <Version Value="2"/> + <UseFileFilters Value="True"/> + </PublishOptions> + <RunParams> + <FormatVersion Value="2"/> + <Modes Count="0"/> + </RunParams> + <Units Count="1"> + <Unit0> + <Filename Value="eawparser.lpr"/> + <IsPartOfProject Value="True"/> + </Unit0> + </Units> + </ProjectOptions> + <CompilerOptions> + <Version Value="11"/> + <Target> + <Filename Value="eawparser"/> + </Target> + <SearchPaths> + <IncludeFiles Value="$(ProjOutDir)"/> + <UnitOutputDirectory Value="lib/$(TargetCPU)-$(TargetOS)"/> + </SearchPaths> + </CompilerOptions> + <Debugging> + <Exceptions Count="3"> + <Item1> + <Name Value="EAbort"/> + </Item1> + <Item2> + <Name Value="ECodetoolError"/> + </Item2> + <Item3> + <Name Value="EFOpenError"/> + </Item3> + </Exceptions> + </Debugging> +</CONFIG> diff --git a/utils/unicode/eawparser.lpr b/utils/unicode/eawparser.lpr new file mode 100644 index 0000000000..b178b78ab8 --- /dev/null +++ b/utils/unicode/eawparser.lpr @@ -0,0 +1,332 @@ +{ Parser and code generator for the EastAsianWidth. + + Copyright (C) 2021 Nikolay Nikolov <nickysn@users.sourceforge.net> + + This source is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your option) + any later version. + + This code is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + A copy of the GNU General Public License is available on the World Wide Web + at <http://www.gnu.org/copyleft/gpl.html>. You can also obtain it by writing + to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor, + Boston, MA 02110-1335, USA. +} + +program eawparser; + +{$mode objfpc}{$H+} + +uses + SysUtils, StrUtils; + +type + TEastAsianWidth = ( + eawN, + eawA, + eawF, + eawH, + eawNa, + eawW); + + TRange = record + RangeLo, RangeHi: UCS4Char; + end; + TRanges = array of TRange; + +var + EastAsianWidths: array [UCS4Char] of TEastAsianWidth; + EAWStats: array [TEastAsianWidth] of record + Exists: Boolean; + Handled: Boolean; + MinValue: UCS4Char; + MaxValue: UCS4Char; + Count: LongInt; + Ranges: TRanges; + end; + +function ParseEastAsianWidth(S: string): TEastAsianWidth; +begin + S := Trim(S); + case S of + 'N': + Result := eawN; + 'A': + Result := eawA; + 'F': + Result := eawF; + 'H': + Result := eawH; + 'Na': + Result := eawNa; + 'W': + Result := eawW; + else + raise EArgumentException('Unknown east asian width: ''' + S + ''''); + end; +end; + +procedure ParseRange(S: string; out RangeLo, RangeHi: UCS4Char); +var + dp: SizeInt; +begin + S := Trim(S); + dp := Pos('..', S); + if dp > 0 then + begin + RangeLo := StrToInt('$' + LeftStr(S, dp - 1)); + RangeHi := StrToInt('$' + Copy(S, dp + 2, Length(S) - dp + 3)); + end + else + begin + RangeLo := StrToInt('$' + S); + RangeHi := RangeLo; + end; +end; + +procedure ParseEastAsianWidths(const FileName: string); +var + InF: TextFile; + S: string; + SplitS: TStringArray; + LineNr: Integer = 0; + eaw: TEastAsianWidth; + RangeLo, RangeHi, R: UCS4Char; +begin + { - All code points, assigned or unassigned, that are not listed + explicitly are given the value "N". } + for R in UCS4Char do + EastAsianWidths[R] := eawN; + { - The unassigned code points in the following blocks default to "W": + CJK Unified Ideographs Extension A: U+3400..U+4DBF + CJK Unified Ideographs: U+4E00..U+9FFF + CJK Compatibility Ideographs: U+F900..U+FAFF } + for R := $3400 to $4DBF do + EastAsianWidths[R] := eawW; + for R := $4E00 to $9FFF do + EastAsianWidths[R] := eawW; + for R := $F900 to $FAFF do + EastAsianWidths[R] := eawW; + { - All undesignated code points in Planes 2 and 3, whether inside or + outside of allocated blocks, default to "W": + Plane 2: U+20000..U+2FFFD + Plane 3: U+30000..U+3FFFD } + for R := $20000 to $2FFFD do + EastAsianWidths[R] := eawW; + for R := $30000 to $3FFFD do + EastAsianWidths[R] := eawW; + + if not FileExists(FileName) then + begin + Writeln('File doesn''t exist: ', FileName); + Halt(1); + end; + AssignFile(InF, FileName); + Reset(InF); + while not EoF(InF) do + begin + Inc(LineNr); + Readln(InF, S); + S := Trim(S); + if Pos('#', S) > 0 then + S := LeftStr(S, Pos('#', S) - 1); + if S <> '' then + begin + SplitS := S.Split([';']); + if Length(SplitS) <> 2 then + raise Exception.Create('Invalid number of ; separators on line ' + IntToStr(LineNr)); + ParseRange(SplitS[0], RangeLo, RangeHi); + eaw := ParseEastAsianWidth(SplitS[1]); + for R := RangeLo to RangeHi do + EastAsianWidths[R] := eaw; + end; + end; + CloseFile(InF); +end; + +procedure CalcStatsAndRanges; +var + Ch: UCS4Char; + eaw, prev_eaw: TEastAsianWidth; +begin + FillChar(EAWStats, SizeOf(EAWStats), 0); + eaw := Low(TEastAsianWidth); + for Ch := Low(UCS4Char) to High(UCS4Char) do + begin + prev_eaw := eaw; + eaw := EastAsianWidths[Ch]; + with EAWStats[eaw] do + begin + if not Exists then + begin + Exists := True; + MinValue := Ch; + MaxValue := Ch; + Count := 1; + SetLength(Ranges, 1); + Ranges[0].RangeLo := Ch; + Ranges[0].RangeHi := Ch; + end + else + begin + MaxValue := Ch; + Inc(Count); + if prev_eaw <> eaw then + begin + SetLength(Ranges, Length(Ranges) + 1); + with Ranges[High(Ranges)] do + begin + RangeLo := Ch; + RangeHi := Ch; + end; + end + else + Ranges[High(Ranges)].RangeHi := Ch; + end; + end; + end; +end; + +procedure MaybeCoalesceRanges(RLo, RHi: UCS4Char); +var + eaw: TEastAsianWidth; + RI: Integer; +begin + for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do + if EAWStats[eaw].Exists and (not EAWStats[eaw].Handled) then + begin + for RI := 0 to High(EAWStats[eaw].Ranges) - 1 do + if (EAWStats[eaw].Ranges[RI].RangeHi = (RLo - 1)) and + (EAWStats[eaw].Ranges[RI + 1].RangeLo = (RHi + 1)) then + begin + EAWStats[eaw].Ranges[RI].RangeHi := EAWStats[eaw].Ranges[RI + 1].RangeHi; + Delete(EAWStats[eaw].Ranges, RI + 1, 1); + exit; + end; + end; +end; + +function FindMinRangeCount: Integer; +var + eaw: TEastAsianWidth; +begin + Result := High(Integer); + for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do + if EAWStats[eaw].Exists and (not EAWStats[eaw].Handled) and (Length(EAWStats[eaw].Ranges) < Result) then + Result := Length(EAWStats[eaw].Ranges); +end; + +procedure GenCode(const OutFileName: string); +const + RangeCountThreshold = 30{400}; +var + eaw: TEastAsianWidth; + RI, NextRangeCount: Integer; + OutFile: TextFile; +begin + Writeln('Generating file: ', OutFileName); + + AssignFile(OutFile, OutFileName); + Rewrite(OutFile); + + Writeln(OutFile, '{ do not edit, this file is autogenerated by the eawparser tool }'); + + { unused properties are already handled } + for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do + if not EAWStats[eaw].Exists then + EAWStats[eaw].Handled := True; + + { handle single codepoints first } + for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do + if (not EAWStats[eaw].Handled) and (EAWStats[eaw].Count = 1) then + begin + if EAWStats[eaw].MinValue <> EAWStats[eaw].MaxValue then + raise Exception.Create('Internal error'); + Writeln(OutFile, 'if Ch=', EAWStats[eaw].MinValue, 'then result:=',eaw,' else'); + EAWStats[eaw].Handled := True; + MaybeCoalesceRanges(EAWStats[eaw].MinValue, EAWStats[eaw].MaxValue); + end; + + { handle single range codepoints next } + while FindMinRangeCount = 1 do + for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do + if (not EAWStats[eaw].Handled) and (Length(EAWStats[eaw].Ranges) = 1) then + begin + Writeln(OutFile, 'if(Ch>=', EAWStats[eaw].MinValue, ')and(Ch<=', EAWStats[eaw].MaxValue, ')then result:=',eaw,' else'); + EAWStats[eaw].Handled := True; + MaybeCoalesceRanges(EAWStats[eaw].MinValue, EAWStats[eaw].MaxValue); + end; + + repeat + NextRangeCount := FindMinRangeCount; + if NextRangeCount <= RangeCountThreshold then + for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do + begin + if not EAWStats[eaw].Handled and (Length(EAWStats[eaw].Ranges) <= NextRangeCount) then + begin + EAWStats[eaw].Handled := True; + Write(OutFile, 'if'); + for RI := 0 to High(EAWStats[eaw].Ranges) do + begin + if RI <> 0 then + Writeln(OutFile, 'or'); + with EAWStats[eaw].Ranges[RI] do + begin + if RangeLo = RangeHi then + Write(OutFile, '(Ch=', RangeLo, ')') + else + Write(OutFile, '((Ch>=', RangeLo, ')and(Ch<=', RangeHi, '))'); + MaybeCoalesceRanges(RangeLo, RangeHi); + end; + end; + Writeln(OutFile, 'then result:=',eaw,' else'); + end; + end; + until NextRangeCount > RangeCountThreshold; + + if NextRangeCount <> High(Integer) then + begin + //for eaw := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do + // if not EAWStats[eaw].Handled then + // Writeln(eaw, ' ', EAWStats[eaw].MinValue, '..', EAWStats[eaw].MaxValue, ' ', EAWStats[eaw].Count, ' ', Length(EAWStats[eaw].Ranges), ' ', (EAWStats[eaw].MaxValue - EAWStats[eaw].MinValue + 7) div 8); + Writeln(OutFile, 'case Ch of'); + for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do + begin + if not EAWStats[eaw].Handled then + begin + EAWStats[eaw].Handled := True; + for RI := 0 to High(EAWStats[eaw].Ranges) do + begin + if RI <> 0 then + Writeln(OutFile, ','); + with EAWStats[eaw].Ranges[RI] do + begin + if RangeLo = RangeHi then + Write(OutFile, RangeLo) + else + Write(OutFile, RangeLo, '..', RangeHi); + end; + end; + Writeln(OutFile, ':result:=', eaw, ';'); + end; + end; + Writeln(OutFile, 'else result:=eawN end'); + end + else + Writeln(OutFile, 'result:=eawN'); + + CloseFile(OutFile); +end; + +begin + ParseEastAsianWidths('data/UCD/EastAsianWidth.txt'); + CalcStatsAndRanges; + GenCode('eastasianwidth_code.inc'); + Writeln('Done'); +end. + diff --git a/utils/unicode/fpmake.pp b/utils/unicode/fpmake.pp index e82c97e560..ed826dc7b8 100644 --- a/utils/unicode/fpmake.pp +++ b/utils/unicode/fpmake.pp @@ -61,6 +61,7 @@ begin T:=P.Targets.AddProgram('cldrparser.lpr'); T:=P.Targets.AddProgram('unihelper.lpr'); T:=P.Targets.AddProgram('gbpparser.lpr'); + T:=P.Targets.AddProgram('eawparser.lpr'); end; end;