123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606 |
- # This set of tests is for UTF-16 and UTF-32 support, including Unicode
- # properties. It is relevant only to the 16-bit and 32-bit libraries. The
- # output is different for each library, so there are separate output files.
- /ÃÃÃxxx/IB,utf,no_utf_check
- /abc/utf
- Ã]
- # Check maximum character size
- /\x{ffff}/IB,utf
- /\x{10000}/IB,utf
- /\x{100}/IB,utf
- /\x{1000}/IB,utf
- /\x{10000}/IB,utf
- /\x{100000}/IB,utf
- /\x{10ffff}/IB,utf
- /[\x{ff}]/IB,utf
- /[\x{100}]/IB,utf
- /\x80/IB,utf
- /\xff/IB,utf
- /\x{D55c}\x{ad6d}\x{C5B4}/IB,utf
- \x{D55c}\x{ad6d}\x{C5B4}
- /\x{65e5}\x{672c}\x{8a9e}/IB,utf
- \x{65e5}\x{672c}\x{8a9e}
- /\x{80}/IB,utf
- /\x{084}/IB,utf
- /\x{104}/IB,utf
- /\x{861}/IB,utf
- /\x{212ab}/IB,utf
- /[^ab\xC0-\xF0]/IB,utf
- \x{f1}
- \x{bf}
- \x{100}
- \x{1000}
- \= Expect no match
- \x{c0}
- \x{f0}
- /Ā{3,4}/IB,utf
- \x{100}\x{100}\x{100}\x{100\x{100}
- /(\x{100}+|x)/IB,utf
- /(\x{100}*a|x)/IB,utf
- /(\x{100}{0,2}a|x)/IB,utf
- /(\x{100}{1,2}a|x)/IB,utf
- /\x{100}/IB,utf
- /a\x{100}\x{101}*/IB,utf
- /a\x{100}\x{101}+/IB,utf
- /[^\x{c4}]/IB
- /[\x{100}]/IB,utf
- \x{100}
- Z\x{100}
- \x{100}Z
- /[\xff]/IB,utf
- >\x{ff}<
- /[^\xff]/IB,utf
- /\x{100}abc(xyz(?1))/IB,utf
- /\777/I,utf
- \x{1ff}
- \777
- /\x{100}+\x{200}/IB,utf
- /\x{100}+X/IB,utf
- /^[\QĀ\E-\Q�\E/B,utf
- /X/utf
- XX\x{d800}\=no_utf_check
- XX\x{da00}\=no_utf_check
- XX\x{dc00}\=no_utf_check
- XX\x{de00}\=no_utf_check
- XX\x{dfff}\=no_utf_check
- \= Expect UTF error
- XX\x{d800}
- XX\x{da00}
- XX\x{dc00}
- XX\x{de00}
- XX\x{dfff}
- XX\x{110000}
- XX\x{d800}\x{1234}
- \= Expect no match
- XX\x{d800}\=offset=3
-
- /(?<=.)X/utf
- XX\x{d800}\=offset=3
- /(*UTF16)\x{11234}/
- abcd\x{11234}pqr
- /(*UTF)\x{11234}/I
- abcd\x{11234}pqr
- /(*UTF-32)\x{11234}/
- abcd\x{11234}pqr
- /(*UTF-32)\x{112}/
- abcd\x{11234}pqr
- /(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I
- /(*CRLF)(*UTF32)(*BSR_UNICODE)a\Rb/I
- /\h/I,utf
- ABC\x{09}
- ABC\x{20}
- ABC\x{a0}
- ABC\x{1680}
- ABC\x{180e}
- ABC\x{2000}
- ABC\x{202f}
- ABC\x{205f}
- ABC\x{3000}
- /\v/I,utf
- ABC\x{0a}
- ABC\x{0b}
- ABC\x{0c}
- ABC\x{0d}
- ABC\x{85}
- ABC\x{2028}
- /\h*A/I,utf
- CDBABC
- \x{2000}ABC
- /\R*A/I,bsr=unicode,utf
- CDBABC
- \x{2028}A
- /\v+A/I,utf
- /\s?xxx\s/I,utf
- /\sxxx\s/I,utf,tables=2
- AB\x{85}xxx\x{a0}XYZ
- AB\x{a0}xxx\x{85}XYZ
- /\S \S/I,utf,tables=2
- \x{a2} \x{84}
- A Z
- /a+/utf
- a\x{123}aa\=offset=1
- a\x{123}aa\=offset=2
- a\x{123}aa\=offset=3
- \= Expect no match
- a\x{123}aa\=offset=4
- \= Expect bad offset error
- a\x{123}aa\=offset=5
- a\x{123}aa\=offset=6
- /\x{1234}+/Ii,utf
- /\x{1234}+?/Ii,utf
- /\x{1234}++/Ii,utf
- /\x{1234}{2}/Ii,utf
- /[^\x{c4}]/IB,utf
- /X+\x{200}/IB,utf
- /\R/I,utf
- # Check bad offset
- /a/utf
- \= Expect bad UTF-16 offset, or no match in 32-bit
- \x{10000}\=offset=1
- \x{10000}ab\=offset=1
- \= Expect 16-bit match, 32-bit no match
- \x{10000}ab\=offset=2
- \= Expect no match
- \x{10000}ab\=offset=3
- \= Expect no match in 16-bit, bad offset in 32-bit
- \x{10000}ab\=offset=4
- \= Expect bad offset
- \x{10000}ab\=offset=5
- /í¼€/utf
- /\w+\x{C4}/B,utf
- a\x{C4}\x{C4}
- /\w+\x{C4}/B,utf,tables=2
- a\x{C4}\x{C4}
-
- /\W+\x{C4}/B,utf
- !\x{C4}
-
- /\W+\x{C4}/B,utf,tables=2
- !\x{C4}
- /\W+\x{A1}/B,utf
- !\x{A1}
-
- /\W+\x{A1}/B,utf,tables=2
- !\x{A1}
- /X\s+\x{A0}/B,utf
- X\x20\x{A0}\x{A0}
- /X\s+\x{A0}/B,utf,tables=2
- X\x20\x{A0}\x{A0}
- /\S+\x{A0}/B,utf
- X\x{A0}\x{A0}
- /\S+\x{A0}/B,utf,tables=2
- X\x{A0}\x{A0}
- /\x{a0}+\s!/B,utf
- \x{a0}\x20!
- /\x{a0}+\s!/B,utf,tables=2
- \x{a0}\x20!
- /(*UTF)abc/never_utf
- /abc/utf,never_utf
- /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf
- /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf
- /AB\x{1fb0}/IB,utf
- /AB\x{1fb0}/IBi,utf
- /\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf
- \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
- \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
- /[ⱥ]/Bi,utf
- /[^ⱥ]/Bi,utf
- /[[:blank:]]/B,ucp
- /\x{212a}+/Ii,utf
- KKkk\x{212a}
- /s+/Ii,utf
- SSss\x{17f}
- # Non-UTF characters should give errors in both 16-bit and 32-bit modes.
- /\x{110000}/utf
- /\o{4200000}/utf
- /\x{100}*A/IB,utf
- A
- /\x{100}*\d(?R)/IB,utf
- /[Z\x{100}]/IB,utf
- Z\x{100}
- \x{100}
- \x{100}Z
- /[z-\x{100}]/IB,utf
- /[z\Qa-d]Ā\E]/IB,utf
- \x{100}
- Ā
- /[ab\x{100}]abc(xyz(?1))/IB,utf
- /\x{100}*\s/IB,utf
- /\x{100}*\d/IB,utf
- /\x{100}*\w/IB,utf
- /\x{100}*\D/IB,utf
- /\x{100}*\S/IB,utf
- /\x{100}*\W/IB,utf
- /[\x{105}-\x{109}]/IBi,utf
- \x{104}
- \x{105}
- \x{109}
- \= Expect no match
- \x{100}
- \x{10a}
-
- /[z-\x{100}]/IBi,utf
- Z
- z
- \x{39c}
- \x{178}
- |
- \x{80}
- \x{ff}
- \x{100}
- \x{101}
- \= Expect no match
- \x{102}
- Y
- y
- /[z-\x{100}]/IBi,utf
- /\x{3a3}B/IBi,utf
- /./utf
- \x{110000}
- /(*UTF)abý¿¿¿¿¿z/B
- /abý¿¿¿¿¿z/utf
- /[\W\p{Any}]/B
- abc
- 123
- /[\W\pL]/B
- abc
- \x{100}
- \x{308}
- \= Expect no match
- 123
- /[\s[:^ascii:]]/B,ucp
- /\pP/ucp
- \x{7fffffff}
- # A special extra option allows escaped surrogate code points in 32-bit mode,
- # but subjects containing them must not be UTF-checked. These patterns give
- # errors in 16-bit mode.
- /\x{d800}/I,utf,allow_surrogate_escapes
- \x{d800}\=no_utf_check
- /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
- \x{dfff}\x{df01}\=no_utf_check
- # This has different starting code units in 8-bit mode.
- /^[^ab]/IB,utf
- c
- \x{ff}
- \x{100}
- \= Expect no match
- aaa
-
- # Offsets are different in 8-bit mode.
- /(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
- 123abcáyzabcdef789abcሴqr
-
- # A few script run tests in non-UTF mode (but they need Unicode support)
- /^(*script_run:.{4})/
- \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
- \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
- \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
-
- /^(*sr:.*)/utf,allow_surrogate_escapes
- \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
- \x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check
- /(?(n/utf
- /(?(á/utf
- # Invalid UTF-16/32 tests.
- /.../g,match_invalid_utf
- abcd\x{df00}wxzy\x{df00}pqrs
- abcd\x{80}wxzy\x{df00}pqrs
- /abc/match_invalid_utf
- ab\x{df00}ab\=ph
- \= Expect no match
- ab\x{df00}cdef\=ph
- /.a/match_invalid_utf
- ab\=ph
- ab\=ps
- \= Expect no match
- b\x{df00}\=ph
- b\x{df00}\=ps
- /.a$/match_invalid_utf
- ab\=ph
- ab\=ps
- \= Expect no match
- b\x{df00}\=ph
- b\x{df00}\=ps
- /ab$/match_invalid_utf
- ab\x{df00}cdeab
- \= Expect no match
- ab\x{df00}cde
- /.../g,match_invalid_utf
- abcd\x{80}wxzy\x{df00}pqrs
- /(?<=x)../g,match_invalid_utf
- abcd\x{80}wxzy\x{df00}pqrs
- abcd\x{80}wxzy\x{df00}xpqrs
- /X$/match_invalid_utf
- \= Expect no match
- X\x{df00}
-
- /(?<=..)X/match_invalid_utf,aftertext
- AB\x{df00}AQXYZ
- AB\x{df00}AQXYZ\=offset=5
- AB\x{df00}\x{df00}AXYZXC\=offset=5
- \= Expect no match
- AB\x{df00}XYZ
- AB\x{df00}XYZ\=offset=3
- AB\x{df00}AXYZ
- AB\x{df00}AXYZ\=offset=4
- AB\x{df00}\x{df00}AXYZ\=offset=5
- /.../match_invalid_utf
- \= Expect no match
- A\x{d800}B
- A\x{110000}B
-
- /aa/utf,ucp,match_invalid_utf,global
- aa\x{d800}aa
- /aa/utf,ucp,match_invalid_utf,global
- \x{d800}aa
-
- /A\z/utf,match_invalid_utf
- A\x{df00}\n
- # ----------------------------------------------------
- /(*UTF)(?=\x{123})/I
- /[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
- /[\xff\x{ffff}]/I,utf
- /[\xff\x{ff}]/I,utf
- /[\xff\x{ff}]/I
- /[Ss]/I
- /[Ss]/I,utf
- /(?:\x{ff}|\x{3000})/I,utf
- # ----------------------------------------------------
- # UCP and casing tests
- /\x{120}/i,I
- /\x{c1}/i,I,ucp
- /[\x{120}\x{121}]/iB,ucp
- /[ab\x{120}]+/iB,ucp
- aABb\x{121}\x{120}
- /\x{c1}/i,no_start_optimize
- \= Expect no match
- \x{e1}
- /\x{120}\x{c1}/i,ucp,no_start_optimize
- \x{121}\x{e1}
- /\x{120}\x{c1}/i,ucp
- \x{121}\x{e1}
- /[^\x{120}]/i,no_start_optimize
- \x{121}
- /[^\x{120}]/i,ucp,no_start_optimize
- \= Expect no match
- \x{121}
- /[^\x{120}]/i
- \x{121}
- /[^\x{120}]/i,ucp
- \= Expect no match
- \x{121}
-
- /\x{120}{2}/i,ucp
- \x{121}\x{121}
- /[^\x{120}]{2}/i,ucp
- \= Expect no match
- \x{121}\x{121}
- /\x{c1}+\x{e1}/iB,ucp
- \x{c1}\x{c1}\x{c1}
- /\x{c1}+\x{e1}/iIB,ucp
- \x{c1}\x{c1}\x{c1}
- \x{e1}\x{e1}\x{e1}
- /a|\x{c1}/iI,ucp
- \x{e1}xxx
- /\x{c1}|\x{e1}/iI,ucp
- /X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
- X\x{e1}Y
- /X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
- X\x{121}Y
- /s/i,ucp
- \x{17f}
- /s/i,utf
- \x{17f}
- /[^s]/i,ucp
- \= Expect no match
- \x{17f}
- /[^s]/i,utf
- \= Expect no match
- \x{17f}
- # ----------------------------------------------------
- # Quantifier after a literal that has the value of META_ACCEPT (not UTF). This
- # fails in 16-bit mode, but is OK for 32-bit.
- /\x{802a0000}*/
- \x{802a0000}\x{802a0000}
- # UTF matching without UTF, check invalid UTF characters
- /\X++/
- a\x{110000}\x{ffffffff}
- # This used to loop in 32-bit mode; it will fail in 16-bit mode.
- /[\x{ffffffff}]/caseless,ucp
- \x{ffffffff}xyz
-
- # These are 32-bit tests for handling 0xffffffff when in UCP caseless mode. They
- # will give errors in 16-bit mode.
- /k*\x{ffffffff}/caseless,ucp
- \x{ffffffff}
- /k+\x{ffffffff}/caseless,ucp,no_start_optimize
- K\x{ffffffff}
- \= Expect no match
- \x{ffffffff}\x{ffffffff}
- /k{2}\x{ffffffff}/caseless,ucp,no_start_optimize
- \= Expect no match
- \x{ffffffff}\x{ffffffff}\x{ffffffff}
- /k\x{ffffffff}/caseless,ucp,no_start_optimize
- K\x{ffffffff}
- \= Expect no match
- \x{ffffffff}\x{ffffffff}\x{ffffffff}
- /k{2,}?Z/caseless,ucp,no_start_optimize,no_auto_possess
- \= Expect no match
- Kk\x{ffffffff}\x{ffffffff}\x{ffffffff}Z
- # ---------------------------------------------------------
- # End of testinput12
|