123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497 |
- { UnicodeSet implementation.
- Copyright (c) 2013-2015 by Inoussa OUEDRAOGO
- The source code is distributed under the Library GNU
- General Public License with the following modification:
- - object files and libraries linked into an application may be
- distributed without source code.
- If you didn't receive a copy of the file COPYING, contact:
- Free Software Foundation
- 675 Mass Ave
- Cambridge, MA 02139
- USA
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- }
- unit unicodeset;
- {$mode delphi}{$H+}
- {$scopedenums on}
- interface
- uses
- SysUtils,
- grbtree, helper;
- type
- EUnicodeSetException = class(Exception)
- end;
- TUnicodeSet = class;
- { TPatternParser }
- TPatternParser = class
- private
- FBufferStr : UnicodeString;
- FBuffer : PUnicodeChar;
- FBufferLength : Integer;
- FSet : TUnicodeSet;
- FPosition : Integer;
- FSpecialChar: Boolean;
- private
- procedure Error(const AMsg : string; const AArgs : array of const);overload;inline;
- procedure Error(const AMsg : string);overload;inline;
- procedure SetBuffer(const APattern : PUnicodeChar; const ALength : Integer);
- procedure CheckEOF();inline;overload;
- procedure CheckEOF(ALength : Integer);overload;inline;
- procedure UnexpectedEOF();inline;
- function IsThis(AItem : UnicodeString; const APosition : Integer) : Boolean;overload;
- function IsThis(AItem : UnicodeString) : Boolean;overload;inline;
- procedure Expect(AItem : UnicodeString; const APosition : Integer);overload;inline;
- procedure Expect(AItem : UnicodeString);overload;inline;
- procedure SkipSpaces();inline;
- function NextChar() : TUnicodeCodePoint;
- procedure ParseItem();
- procedure DoParse();
- property SpecialChar : Boolean read FSpecialChar;
- public
- procedure Parse(const APattern : PUnicodeChar; const ALength : Integer);overload;
- procedure Parse(const APattern : UnicodeString);overload;inline;
- property CurrentSet : TUnicodeSet read FSet write FSet;
- end;
- TUnicodeCodePointArrayComparator = class
- public
- // Return
- // * if A>B then 1
- // * if A=B then 0
- // * if A<B then -1
- class function Compare(const A, B : TUnicodeCodePointArray) : Integer;static;inline;
- end;
- { TUnicodeSet }
- TUnicodeSet = class
- private type
- TItem = TUnicodeCodePointArray;
- TTree = TRBTree<TItem,TUnicodeCodePointArrayComparator>;
- public type
- TIterator = TTree.TIterator;
- private
- FTree : TTree;
- FParser : TPatternParser;
- private
- procedure CreateParser();inline;
- function InternalContains(const AString : UnicodeString) : Boolean;overload;
- public
- constructor Create();
- destructor Destroy;override;
- procedure Add(AChar : TUnicodeCodePoint);inline;overload;
- procedure Add(AString : TUnicodeCodePointArray);inline;overload;
- procedure AddRange(const AStart, AEnd : TUnicodeCodePoint);inline;
- procedure AddPattern(const APattern : UnicodeString);inline;overload;
- procedure AddPattern(const APattern : RawByteString);inline;overload;
- function CreateIterator() : TIterator;
- function Contains(const AString : array of TUnicodeCodePoint) : Boolean;overload;
- function Contains(const AChar : TUnicodeCodePoint) : Boolean;inline;overload;
- function Contains(const AChar : UnicodeChar) : Boolean;inline;overload;
- function Contains(const AChar : AnsiChar) : Boolean;inline;overload;
- function Contains(const AString : UnicodeString) : Boolean;overload;
- function Contains(const AString : RawByteString) : Boolean;overload;
- end;
- resourcestring
- SInvalidLength = 'Invalid length value : "%d".';
- SInvalidPosition = 'Invalid position : "%d".';
- SInvalidRangeLimits = 'Invalid range limits : ["%x" , "%x"].';
- SExpectedBut = 'Expects "%s" but got "%s..." .';
- SUnexpectedEOF = 'Unexpected end of file.';
- implementation
- uses
- unicodedata;
- function ToArray(const AItem : TUnicodeCodePoint) : TUnicodeCodePointArray;inline;
- begin
- SetLength(Result,1);
- Result[Low(Result)] := AItem;
- end;
- function CompareItem(const Item1, Item2 : TUnicodeCodePointArray): Integer;
- var
- a, b : ^TUnicodeCodePoint;
- i, ha, hb : Integer;
- begin
- if (Pointer(Item1) = Pointer(Item2)) then
- exit(0);
- if (Item1 = nil) then
- exit(-1);
- if (Item2 = nil) then
- exit(1);
- a := @Item1[0];
- b := @Item2[0];
- Result := 1;
- ha := Length(Item1) - 1;
- hb := Length(Item2) - 1;
- for i := 0 to ha do begin
- if (i > hb) then
- exit;
- if (a^ < b^) then
- exit(-1);
- if (a^ > b^) then
- exit(1);
- Inc(a);
- Inc(b);
- end;
- if (ha = hb) then
- exit(0);
- exit(-1);
- end;
- { TUnicodeCodePointArrayComparator }
- class function TUnicodeCodePointArrayComparator.Compare(const A, B : TUnicodeCodePointArray): Integer;
- begin
- Result := CompareItem(A,B);
- end;
- { TPatternParser }
- procedure TPatternParser.Error(const AMsg: string; const AArgs: array of const);
- begin
- raise EUnicodeSetException.CreateFmt(AMsg,AArgs);
- end;
- procedure TPatternParser.Error(const AMsg: string);
- begin
- raise EUnicodeSetException.Create(AMsg);
- end;
- procedure TPatternParser.SetBuffer(
- const APattern : PUnicodeChar;
- const ALength : Integer
- );
- begin
- FPosition := 0;
- if (ALength <= 1) then begin
- FBufferStr := '';
- FBuffer := nil;
- FBufferLength := 0;
- exit;
- end;
- FBufferLength := ALength;
- SetLength(FBufferStr,FBufferLength);
- FBuffer := @FBufferStr[1];
- Move(APattern^,FBuffer^,(FBufferLength*SizeOf(FBuffer^)));
- end;
- procedure TPatternParser.CheckEOF();
- begin
- CheckEOF(0);
- end;
- procedure TPatternParser.CheckEOF(ALength : Integer);
- begin
- if (ALength < 0) then
- Error(SInvalidLength,[ALength]);
- if ((FPosition+ALength) >= FBufferLength) then
- UnexpectedEOF();
- end;
- procedure TPatternParser.UnexpectedEOF();
- begin
- Error(SUnexpectedEOF);
- end;
- function TPatternParser.IsThis(AItem: UnicodeString; const APosition: Integer): Boolean;
- var
- i, k, c : Integer;
- begin
- if (APosition < 0) then
- Error(SInvalidPosition,[APosition]);
- Result := False;
- c := Length(AItem);
- if (c = 0) then
- exit;
- i := APosition;
- k := i + c;
- if (k >= FBufferLength) then
- exit;
- if CompareMem(@AItem[1], @FBuffer[APosition],c) then
- Result := True;
- end;
- function TPatternParser.IsThis(AItem : UnicodeString) : Boolean;
- begin
- Result := IsThis(AItem,FPosition);
- end;
- procedure TPatternParser.Expect(AItem: UnicodeString; const APosition: Integer);
- begin
- if not IsThis(AItem,APosition) then
- Error(SExpectedBut,[AItem,Copy(FBuffer,APosition,Length(AItem))]);
- end;
- procedure TPatternParser.Expect(AItem: UnicodeString);
- begin
- Expect(AItem,FPosition);
- end;
- procedure TPatternParser.SkipSpaces();
- begin
- while (FPosition < FBufferLength) do begin
- if (FBuffer[FPosition] <> ' ') then
- Break;
- Inc(FPosition);
- end;
- end;
- function TPatternParser.NextChar(): TUnicodeCodePoint;
- var
- i : Integer;
- c : UnicodeChar;
- cp : TUnicodeCodePoint;
- s : UnicodeString;
- begin
- SkipSpaces();
- CheckEOF();
- c := FBuffer[FPosition];
- cp := Ord(c);
- Inc(FPosition);
- if (c = '\') and (FPosition < FBufferLength) then begin
- if IsThis('\') then begin
- Inc(FPosition);
- CheckEOF();
- cp := Ord(FBuffer[FPosition]);
- Inc(FPosition);
- end else if IsThis('u') then begin
- Inc(FPosition);
- CheckEOF(4);
- s := Copy(FBufferStr,(FPosition+1),4);
- Inc(FPosition,4);
- if not TryStrToInt(string('$'+s),i) then
- Error(SExpectedBut,['\uXXXX',s]);
- cp := i;
- end;
- end;
- if (cp <= MAX_WORD) and UnicodeIsLowSurrogate(UnicodeChar(Word(cp))) then begin
- SkipSpaces();
- CheckEOF();
- c := UnicodeChar(Word(cp));
- if UnicodeIsSurrogatePair(c,FBuffer[FPosition]) then begin
- cp := ToUCS4(c,FBuffer[FPosition]);
- Inc(FPosition);
- end;
- end;
- FSpecialChar := (cp = Ord('{')) or (cp = Ord('}'));
- Result := cp;
- end;
- function CompareTo(const A : TUnicodeCodePoint; const B : UnicodeChar) : Boolean;inline;
- begin
- Result := (A = Ord(B));
- end;
- procedure TPatternParser.ParseItem();
- var
- cp, lastCp : TUnicodeCodePoint;
- charCount, k : Integer;
- cpa : TUnicodeCodePointArray;
- begin
- SkipSpaces();
- Expect('[');
- charCount := 0;
- Inc(FPosition);
- cp:=0;
- while (FPosition < FBufferLength) do begin
- lastCp := cp;
- cp := NextChar();
- if CompareTo(cp,']') then
- Break;
- if SpecialChar and (cp = Ord('{')) then begin
- SetLength(cpa,12);
- k := 0;
- while True do begin
- cp := NextChar();
- if SpecialChar and (cp = Ord('}')) then
- break;
- if (k >= Length(cpa)) then
- SetLength(cpa,(2*k));
- cpa[k] := cp;
- k := k+1;
- end;
- if (k > 0) then begin
- SetLength(cpa,k);
- FSet.Add(cpa);
- end;
- end else begin
- if CompareTo(cp,'-') then begin
- if (charCount = 0) then
- Error(SExpectedBut,['<char>','-']);
- cp := NextChar();
- FSet.AddRange(lastCp,cp);
- end else begin
- FSet.Add(cp);
- end;
- end;
- Inc(charCount);
- end;
- end;
- procedure TPatternParser.DoParse();
- begin
- SkipSpaces();
- while (FPosition < FBufferLength) do begin
- ParseItem();
- SkipSpaces();
- end;
- end;
- procedure TPatternParser.Parse(const APattern: PUnicodeChar; const ALength: Integer);
- begin
- if (ALength < 2) then
- exit;
- SetBuffer(APattern,ALength);
- DoParse();
- end;
- procedure TPatternParser.Parse(const APattern : UnicodeString);
- begin
- Parse(@APattern[1],Length(APattern));
- end;
- { TUnicodeSet }
- procedure TUnicodeSet.CreateParser();
- begin
- if (FParser = nil) then begin
- FParser := TPatternParser.Create();
- FParser.CurrentSet := Self;
- end;
- end;
- function TUnicodeSet.InternalContains(const AString: UnicodeString): Boolean;
- var
- u4 : UCS4String;
- c, i : Integer;
- cpa : TUnicodeCodePointArray;
- begin
- u4 := UnicodeStringToUCS4String(AString);
- c := Length(u4)-1;
- if (c = 1) then
- exit(Contains(u4[0]));
- SetLength(cpa,c);
- for i := 0 to c-1 do
- cpa[i] := u4[i];
- Result := Contains(cpa);
- end;
- constructor TUnicodeSet.Create;
- begin
- FTree := TTree.Create();
- end;
- destructor TUnicodeSet.Destroy;
- begin
- FParser.Free();
- FTree.Free();
- inherited Destroy;
- end;
- procedure TUnicodeSet.Add(AChar: TUnicodeCodePoint);
- begin
- FTree.Insert(ToArray(AChar));
- end;
- procedure TUnicodeSet.Add(AString: TUnicodeCodePointArray);
- begin
- if (AString <> nil) then
- FTree.Insert(AString);
- end;
- procedure TUnicodeSet.AddRange(const AStart, AEnd : TUnicodeCodePoint);
- var
- i : Integer;
- begin
- if (AStart > AEnd) then
- raise EUnicodeSetException.CreateFmt(SInvalidRangeLimits,[AStart,AEnd]);
- for i := AStart to AEnd do
- Add(i);
- end;
- procedure TUnicodeSet.AddPattern(const APattern : UnicodeString);
- begin
- CreateParser();
- FParser.Parse(APattern);
- end;
- procedure TUnicodeSet.AddPattern(const APattern: RawByteString);
- var
- us : UnicodeString;
- begin
- us := UnicodeString(APattern);
- AddPattern(us);
- end;
- function TUnicodeSet.CreateIterator() : TIterator;
- begin
- Result := FTree.CreateForwardIterator();
- end;
- function TUnicodeSet.Contains(const AString : array of TUnicodeCodePoint) : Boolean;
- var
- c : Integer;
- x : TUnicodeCodePointArray;
- begin
- Result := False;
- c := Length(AString);
- if (c = 0) then
- exit;
- SetLength(x,c);
- Move(AString[Low(AString)],x[Low(x)],(c*SizeOf(x[0])));
- if (FTree.FindNode(x) <> nil) then
- Result := True;
- end;
- function TUnicodeSet.Contains(const AChar : TUnicodeCodePoint) : Boolean;
- begin
- Result := Contains([AChar]);
- end;
- function TUnicodeSet.Contains(const AChar : UnicodeChar) : Boolean;
- begin
- Result := Contains(TUnicodeCodePoint(Ord(AChar)));
- end;
- function TUnicodeSet.Contains(const AChar : AnsiChar) : Boolean;
- begin
- Result := Contains(TUnicodeCodePoint(Ord(AChar)));
- end;
- function TUnicodeSet.Contains(const AString: UnicodeString): Boolean;
- begin
- if (AString = '') then
- exit(Contains([]));
- if (Length(AString) = 1) then
- exit(Contains(AString[1]));
- Result := InternalContains(AString);
- end;
- function TUnicodeSet.Contains(const AString: RawByteString): Boolean;
- var
- us : UnicodeString;
- begin
- us := UnicodeString(AString);
- Result := Contains(us);
- end;
- end.
|