Browse Source

* synchronized with trunk

git-svn-id: branches/unicodekvm@48727 -
nickysn 4 years ago
parent
commit
4a56b9eaf8
1 changed files with 143 additions and 0 deletions
  1. 143 0
      packages/rtl-unicode/src/inc/graphemebreakproperty.pp

+ 143 - 0
packages/rtl-unicode/src/inc/graphemebreakproperty.pp

@@ -1,3 +1,33 @@
+{ GraphemeBreakProperty Unicode data unit.
+
+  Copyright (C) 2021 Nikolay Nikolov <[email protected]>
+
+  This library is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or (at your
+  option) any later version with the following modification:
+
+  As a special exception, the copyright holders of this library give you
+  permission to link this library with independent modules to produce an
+  executable, regardless of the license terms of these independent modules, and
+  to copy and distribute the resulting executable under terms of your choice,
+  provided that you also meet, for each linked independent module, the terms
+  and conditions of the license of that module. An independent module is a
+  module which is not derived from or based on this library. If you modify
+  this library, you may extend this exception to your version of the library,
+  but you are not obligated to do so. If you do not wish to do so, delete this
+  exception statement from your version.
+
+  This program is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
+  for more details.
+
+  You should have received a copy of the GNU Library General Public License
+  along with this library; if not, write to the Free Software Foundation,
+  Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1335, USA.
+}
+
 unit graphemebreakproperty;
 
 {$MODE objfpc}
@@ -25,6 +55,29 @@ type
     gbpGlue_After_Zwj,
     gbpE_Base_GAZ);
 
+  { TUnicodeStringExtendedGraphemeClustersEnumerator: walks a UnicodeString one extended grapheme cluster at a time. }
+
+  TUnicodeStringExtendedGraphemeClustersEnumerator = class
+  private
+    FStr: UnicodeString; { string being enumerated; assigned once in Create }
+    FCurrentIndexStart: SizeInt; { 1-based index of the first UTF-16 code unit of the current cluster }
+    FCurrentIndexEnd: SizeInt; { index of the last code unit of the current cluster (inclusive) }
+    FNextIndexEnd: SizeInt; { index of the last code unit of the look-ahead code point; > Length(FStr) once the end is passed }
+    FNextGBP: TGraphemeBreakProperty; { grapheme break property of the look-ahead code point }
+    FNextCodePoint: UCS4Char; { look-ahead code point; set to 0 once the end of the string is passed }
+    FCurrentGBP: TGraphemeBreakProperty; { property of the most recently consumed code point }
+    FCurrentCodePoint: UCS4Char; { most recently consumed code point }
+    FRI_Sequence_Length: Integer; { length of the run of Regional_Indicator code points ending at the current one }
+    FE_Base_EBG_Extend_Sequence: Boolean; { True while the consumed text ends in (E_Base or E_Base_GAZ) followed by zero or more Extend }
+    function GetCurrent: UnicodeString; { getter for the Current property }
+    procedure FetchNextChar; { decodes the next code point into FNextIndexEnd/FNextCodePoint/FNextGBP }
+  public
+    constructor Create(const S: UnicodeString); { stores S and pre-loads its first code point }
+    function GetEnumerator: TUnicodeStringExtendedGraphemeClustersEnumerator; { returns Self }
+    function MoveNext: Boolean; { advances to the next cluster; False when nothing is left }
+    property Current: UnicodeString read GetCurrent; { text of the cluster found by the last MoveNext }
+  end;
+
 function GetGraphemeBreakProperty(Ch: UCS4Char): TGraphemeBreakProperty;
 
 implementation
@@ -34,4 +87,94 @@ begin
   {$I graphemebreakproperty_code.inc}
 end;
 
+{ TUnicodeStringExtendedGraphemeClustersEnumerator }
+
+function TUnicodeStringExtendedGraphemeClustersEnumerator.GetCurrent: UnicodeString;
+begin { Returns the substring of FStr delimited by FCurrentIndexStart and FCurrentIndexEnd, as set by MoveNext. }
+  Result := Copy(FStr, FCurrentIndexStart, FCurrentIndexEnd - FCurrentIndexStart + 1); { +1 because both indices are inclusive }
+end;
+
+procedure TUnicodeStringExtendedGraphemeClustersEnumerator.FetchNextChar;
+begin { Loads the code point that follows FNextIndexEnd into FNextCodePoint and its break property into FNextGBP. }
+  Inc(FNextIndexEnd); { step onto the first code unit of the next code point }
+  if FNextIndexEnd <= Length(FStr) then
+  begin
+    FNextCodePoint := Ord(FStr[FNextIndexEnd]); { raw UTF-16 code unit; kept as is unless it starts a valid surrogate pair }
+    { high surrogate, followed by low surrogate? (an unpaired surrogate fails this test and stays a raw code unit value) }
+    if (FNextCodePoint >= $D800) and (FNextCodePoint <= $DBFF) and ((FNextIndexEnd + 1) <= Length(FStr)) and
+       (Ord(FStr[FNextIndexEnd + 1]) >= $DC00) and (Ord(FStr[FNextIndexEnd + 1]) <= $DFFF) then
+    begin
+      Inc(FNextIndexEnd); { consume the low surrogate too, so FNextIndexEnd points at the last code unit of the pair }
+      FNextCodePoint := $10000 + (((FNextCodePoint - $D800) shl 10) or (Ord(FStr[FNextIndexEnd]) - $DC00)); { combine both halves into one code point >= $10000 }
+    end;
+  end
+  else
+    FNextCodePoint := 0; { past the end of the string: 0 serves as a placeholder value }
+  FNextGBP := GetGraphemeBreakProperty(FNextCodePoint); { classify whatever was loaded, including the end-of-string placeholder }
+end;
+
+constructor TUnicodeStringExtendedGraphemeClustersEnumerator.Create(const S: UnicodeString);
+begin { Stores S, resets all positions and sequence state, and pre-loads the first code point as look-ahead. }
+  FStr := S;
+  FCurrentIndexStart := 0; { 0 = nothing consumed yet; MoveNext sets the start to FCurrentIndexEnd + 1 }
+  FCurrentIndexEnd := 0;
+  FNextIndexEnd := 0; { FetchNextChar increments this before reading, so the first read is at index 1 }
+  FRI_Sequence_Length := 0;
+  FE_Base_EBG_Extend_Sequence := False;
+  FetchNextChar; { fill FNextCodePoint/FNextGBP so the first MoveNext has a look-ahead to consume }
+end;
+
+function TUnicodeStringExtendedGraphemeClustersEnumerator.GetEnumerator: TUnicodeStringExtendedGraphemeClustersEnumerator;
+begin { Returns this same instance: the object acts as its own enumerator (presumably so it can be used directly in a for..in loop). }
+  Result := Self;
+end;
+
+function TUnicodeStringExtendedGraphemeClustersEnumerator.MoveNext: Boolean;
+begin { Advances to the next extended grapheme cluster; returns False once the whole string has been consumed. }
+  FCurrentIndexStart := FCurrentIndexEnd + 1; { the next cluster begins right after the previous one }
+  if FCurrentIndexStart > Length(FStr) then
+    Exit(false); { nothing left to return (also the case for an empty string) }
+  repeat
+    FCurrentGBP := FNextGBP; { consume the look-ahead code point: it becomes the last code point of the cluster so far }
+    FCurrentCodePoint := FNextCodePoint;
+    FCurrentIndexEnd := FNextIndexEnd;
+    if FCurrentGBP = gpbRegional_Indicator then
+      Inc(FRI_Sequence_Length) { run length of regional indicators, including the current one }
+    else
+      FRI_Sequence_Length := 0;
+    FE_Base_EBG_Extend_Sequence := (FCurrentGBP in [gbpE_Base, gbpE_Base_GAZ]) or (FE_Base_EBG_Extend_Sequence and (FCurrentGBP = gbpExtend)); { started by an emoji base, kept alive only by Extend }
+    FetchNextChar; { load the code point on the far side of the candidate break position }
+    if FNextIndexEnd > Length(FStr) then
+      Exit(True); { end of string reached: the cluster ends with the current code point }

+    { Do not break between a CR and LF. Otherwise, break before and after controls. (cf. UAX #29 GB3-GB5; the order of the branches below gives earlier rules priority) }
+    if (FCurrentGBP = gbpCR) and (FNextGBP = gbpLF) then
+      continue { no boundary here: go round the loop again and absorb the look-ahead code point }
+    else if (FCurrentGBP in [gbpControl, gbpCR, gbpLF]) or (FNextGBP in [gbpControl, gbpCR, gbpLF]) then
+      Exit(True) { boundary found: the cluster is FCurrentIndexStart..FCurrentIndexEnd }
+    { Do not break Hangul syllable sequences. (cf. UAX #29 GB6-GB8) }
+    else if ((FCurrentGBP = gbpL) and (FNextGBP in [gbpL, gbpV, gbpLV, gbpLVT])) or
+            ((FCurrentGBP in [gbpLV, gbpV]) and (FNextGBP in [gbpV, gbpT])) or
+            ((FCurrentGBP in [gbpLVT, gbpT]) and (FNextGBP = gbpT)) then
+      continue
+    { Do not break before extending characters or ZWJ. (cf. UAX #29 GB9) }
+    else if FNextGBP in [gbpExtend, gbpZWJ] then
+      continue
+    { Only for extended grapheme clusters:
+      Do not break before SpacingMarks, or after Prepend characters. (cf. UAX #29 GB9a, GB9b) }
+    else if (FCurrentGBP = gbpPrepend) or (FNextGBP = gbpSpacingMark) then
+      continue
+    { Do not break within emoji modifier sequences or emoji zwj sequences. (cf. UAX #29 GB10, GB11) }
+    else if ((FCurrentGBP = gbpZWJ) and (FNextGBP in [gbpGlue_After_Zwj, gbpE_Base_GAZ])) or
+            (FE_Base_EBG_Extend_Sequence and (FNextGBP = gbpE_Modifier)) then
+      continue
+    { Do not break within emoji flag sequences. That is, do not break between regional indicator (RI) symbols if there is an odd number of RI characters before the break point. FRI_Sequence_Length already counts the current RI. (cf. UAX #29 GB12, GB13) }
+    else if (FCurrentGBP = gpbRegional_Indicator) and (FNextGBP = gpbRegional_Indicator) and Odd(FRI_Sequence_Length) then
+      continue
+    { Otherwise, break everywhere. }
+    else
+      Exit(True);
+  until False; { the loop is left only through the Exit calls above }
+end;
+
 end.