textstream.bmx 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595
  1. SuperStrict
  2. Rem
  3. bbdoc: Streams/Text streams
  4. about:
  5. The Text Stream module allows you to load and save text in a number
  6. of formats: LATIN1, UTF8 and UTF16.
  7. The LATIN1 format uses a single byte to represent each character, and
  8. is therefore only capable of manipulating 256 distinct character values.
  9. The UTF8 and UTF16 formats are capable of manipulating up to 1114112
  10. character values, but will generally use greater storage space. In addition,
  11. many text processing applications are unable to handle UTF8 and UTF16 files.
  12. End Rem
  13. Module BRL.TextStream
  14. ModuleInfo "Version: 1.05"
  15. ModuleInfo "Author: Mark Sibly"
  16. ModuleInfo "License: zlib/libpng"
  17. ModuleInfo "Copyright: Blitz Research Ltd"
  18. ModuleInfo "Modserver: BRL"
  19. ModuleInfo "History: 1.05"
  20. ModuleInfo "History: UCS-2 surrogate pairs."
  21. ModuleInfo "History: 1.04"
  22. ModuleInfo "History: Module is now SuperStrict"
  23. ModuleInfo "History: 1.03 Release"
  24. ModuleInfo "History: Modified LoadText to handle stream URLs"
  25. ModuleInfo "History: 1.02 Release"
  26. ModuleInfo "History: Added LoadText, SaveText"
  27. ModuleInfo "History: Fixed UTF16LE=4"
  28. ModuleInfo "History: 1.01 Release"
  29. ModuleInfo "History: 1.00 Release"
  30. ModuleInfo "History: Added TextStream module"
  31. Import BRL.Stream
  32. Enum ETextStreamFormat
  33. LATIN1
  34. UTF8
  35. UTF16BE
  36. UTF16LE
  37. End Enum
  38. Type TTextStream Extends TStreamWrapper
  39. ' deprecated
  40. Const LATIN1:Int = 1
  41. Const UTF8:Int = 2
  42. Const UTF16BE:Int = 3
  43. Const UTF16LE:Int = 4
  44. '***** PUBLIC *****
  45. Method Read:Long( buf:Byte Ptr,count:Long ) Override
  46. For Local i:Long=0 Until count
  47. If _bufcount=32 _FlushRead
  48. Local hi:Int=_ReadByte()
  49. Local lo:Int=_ReadByte()
  50. hi:-48;If hi>9 hi:-7
  51. lo:-48;If lo>9 lo:-7
  52. buf[i]=hi Shl 4 | lo
  53. _bufcount:+1
  54. Next
  55. Return count
  56. End Method
  57. Method Write:Long( buf:Byte Ptr,count:Long ) Override
  58. For Local i:Long=0 Until count
  59. Local hi:Int=buf[i] Shr 4
  60. Local lo:Int=buf[i] & $f
  61. hi:+48;If hi>57 hi:+7
  62. lo:+48;If lo>57 lo:+7
  63. _WriteByte hi
  64. _WriteByte lo
  65. _bufcount:+1
  66. If _bufcount=32 _FlushWrite
  67. Next
  68. Return count
  69. End Method
  70. Method ReadByte:Int() Override
  71. _FlushRead
  72. Return Int( ReadLine() )
  73. End Method
  74. Method WriteByte( n:Int ) Override
  75. _FlushWrite
  76. WriteLine n
  77. End Method
  78. Method ReadShort:Int() Override
  79. _FlushRead
  80. Return Int( ReadLine() )
  81. End Method
  82. Method WriteShort( n:Int ) Override
  83. _FlushWrite
  84. WriteLine n
  85. End Method
  86. Method ReadInt:Int() Override
  87. _FlushRead
  88. Return Int( ReadLine() )
  89. End Method
  90. Method WriteInt( n:Int ) Override
  91. _FlushWrite
  92. WriteLine n
  93. End Method
  94. Method ReadLong:Long() Override
  95. _FlushRead
  96. Return Long( ReadLine() )
  97. End Method
  98. Method WriteLong( n:Long ) Override
  99. _FlushWrite
  100. WriteLine n
  101. End Method
  102. Method ReadFloat:Float() Override
  103. _FlushRead
  104. Return Float( ReadLine() )
  105. End Method
  106. Method WriteFloat( n:Float ) Override
  107. _FlushWrite
  108. WriteLine n
  109. End Method
  110. Method ReadDouble:Double() Override
  111. _FlushRead
  112. Return Double( ReadLine() )
  113. End Method
  114. Method WriteDouble( n:Double ) Override
  115. _FlushWrite
  116. WriteLine n
  117. End Method
  118. Method ReadLine:String() Override
  119. _FlushRead
  120. Local buf:Short[1024],i:Int
  121. While Not Eof()
  122. Local n:Int=ReadChar()
  123. If n=0 Exit
  124. If n=10 Exit
  125. If n=13 Continue
  126. If buf.length=i buf=buf[..i+1024]
  127. buf[i]=n
  128. i:+1
  129. Wend
  130. Return String.FromShorts(buf,i)
  131. End Method
  132. Method ReadFile:String()
  133. _FlushRead
  134. Local buf:Short[1024],i:Int
  135. While Not Eof()
  136. Local n:Int=ReadChar()
  137. If buf.length=i buf=buf[..i+1024]
  138. buf[i]=n
  139. i:+1
  140. Wend
  141. Return String.FromShorts( buf,i )
  142. End Method
  143. Method WriteLine:Int( str:String ) Override
  144. _FlushWrite
  145. WriteString str
  146. WriteString "~r~n"
  147. End Method
  148. Method ReadString:String( length:Int ) Override
  149. _FlushRead
  150. Local buf:Short[length]
  151. For Local i:Int=0 Until length
  152. buf[i]=ReadChar()
  153. Next
  154. Return String.FromShorts(buf,length)
  155. End Method
  156. Method WriteString( str:String ) Override
  157. _FlushWrite
  158. For Local i:Int=0 Until str.length
  159. WriteChar str[i]
  160. Next
  161. End Method
  162. Method ReadChar:Int()
  163. Local c:Int
  164. If _carried Then
  165. c = _carried
  166. _carried = 0
  167. Return c
  168. End If
  169. c = _ReadByte()
  170. Select _encoding
  171. Case ETextStreamFormat.LATIN1
  172. Return c
  173. Case ETextStreamFormat.UTF8
  174. If c<128 Return c
  175. Local d:Int=_ReadByte() & $3f
  176. If c<224 Return ((c & 31) Shl 6) | d
  177. Local e:Int=_ReadByte() & $3f
  178. If c<240 Return ((c & 15) Shl 12) | (d Shl 6) | e
  179. Local f:Int = _ReadByte() & $3f
  180. Local v:Int = ((c & 7) Shl 18) | (d Shl 12) | (e Shl 6) | f
  181. If v & $ffff0000 Then
  182. v :- $10000
  183. d = ((v Shr 10) & $7ffff) + $d800
  184. e = (v & $3ff) + $dc00
  185. _carried = e
  186. Return d
  187. Else
  188. Return v
  189. End If
  190. Case ETextStreamFormat.UTF16BE
  191. Local d:Int=_ReadByte()
  192. Return c Shl 8 | d
  193. Case ETextStreamFormat.UTF16LE
  194. Local d:Int=_ReadByte()
  195. Return d Shl 8 | c
  196. End Select
  197. End Method
  198. Method WriteChar( char:Int )
  199. If _carried Then
  200. Local c:Int = ((_carried - $d800) Shl 10) + (char - $dc00) + $10000
  201. _WriteByte (c Shr 18) | $f0
  202. _WriteByte ((c Shr 12) & $3f) | $80
  203. _WriteByte ((c Shr 6) & $3f) | $80
  204. _WriteByte (c & $3f) | $80
  205. _carried = 0
  206. Return
  207. End If
  208. Assert char>=0 And char<=$ffff
  209. Select _encoding
  210. Case ETextStreamFormat.LATIN1
  211. _WriteByte char
  212. Case ETextStreamFormat.UTF8
  213. If char<128
  214. _WriteByte char
  215. Else If char<2048
  216. _WriteByte char/64 | 192
  217. _WriteByte char Mod 64 | 128
  218. Else If char < $d800 Or char > $dbff
  219. _WriteByte char/4096 | 224
  220. _WriteByte char/64 Mod 64 | 128
  221. _WriteByte char Mod 64 | 128
  222. Else
  223. _carried = char
  224. Return
  225. EndIf
  226. Case ETextStreamFormat.UTF16BE
  227. _WriteByte char Shr 8
  228. _WriteByte char
  229. Case ETextStreamFormat.UTF16LE
  230. _WriteByte char
  231. _WriteByte char Shr 8
  232. End Select
  233. End Method
  234. Function Create:TTextStream( stream:TStream,encoding:Int )
  235. Local enc:ETextStreamFormat
  236. Select encoding
  237. Case LATIN1
  238. enc = ETextStreamFormat.LATIN1
  239. Case UTF8
  240. enc = ETextStreamFormat.UTF8
  241. Case UTF16BE
  242. enc = ETextStreamFormat.UTF16BE
  243. Case UTF16LE
  244. enc = ETextStreamFormat.UTF16LE
  245. End Select
  246. Return Create(stream, enc)
  247. End Function
  248. Function Create:TTextStream( stream:TStream,encoding:ETextStreamFormat )
  249. Local t:TTextStream=New TTextStream
  250. t._encoding=encoding
  251. t.SetStream stream
  252. Return t
  253. End Function
  254. '***** PRIVATE *****
  255. Method _ReadByte:Int()
  256. Return Super.ReadByte()
  257. End Method
  258. Method _WriteByte( n:Int )
  259. Super.WriteByte n
  260. End Method
  261. Method _FlushRead()
  262. If Not _bufcount Return
  263. Local n:Int=_ReadByte()
  264. If n=13 n=_ReadByte()
  265. If n<>10 Throw "Malformed line terminator"
  266. _bufcount=0
  267. End Method
  268. Method _FlushWrite()
  269. If Not _bufcount Return
  270. _WriteByte 13
  271. _WriteByte 10
  272. _bufcount=0
  273. End Method
  274. Field _encoding:ETextStreamFormat
  275. Field _bufcount:Int
  276. Field _carried:Int
  277. End Type
  278. Type TTextStreamFactory Extends TStreamFactory
  279. Method CreateStream:TStream( url:Object,proto:String,path:String,readable:Int,writeMode:Int ) Override
  280. Local encoding:ETextStreamFormat
  281. Select proto
  282. Case "latin1"
  283. encoding=ETextStreamFormat.LATIN1
  284. Case "utf8"
  285. encoding=ETextStreamFormat.UTF8
  286. Case "utf16be"
  287. encoding=ETextStreamFormat.UTF16BE
  288. Case "utf16le"
  289. encoding=ETextStreamFormat.UTF16LE
  290. End Select
  291. If Not encoding Return Null
  292. Local stream:TStream=OpenStream( path,readable,writeMode )
  293. If stream Return TTextStream.Create( stream,encoding )
  294. End Method
  295. End Type
  296. New TTextStreamFactory
  297. Rem
  298. bbdoc: Load text from a stream
  299. returns: A string containing the text
  300. about:
  301. #LoadText loads LATIN1, UTF8 or UTF16 text from @url.
  302. The first bytes read from the stream control the format of the text:
  303. [ &$fe $ff | Text is big endian UTF16
  304. * &$ff $fe | Text is little endian UTF16
  305. * &$ef $bb $bf | Text is UTF8
  306. ]
  307. If the first bytes don't match any of the above values, the stream
  308. is assumed to contain LATIN1 text. Additionally, when @checkForUTF8 is enabled, the
  309. stream will be tested for UTF8 compatibility, and loaded as such as appropriate.
  310. A #TStreamReadException is thrown if not all bytes could be read.
  311. End Rem
  312. Function LoadText:String( url:Object, checkForUTF8:Int = True )
  313. Local stream:TStream=ReadStream( url )
  314. If Not stream Throw New TStreamReadException
  315. Local format:ETextStreamFormat
  316. Local size:Int,c:Int,d:Int,e:Int
  317. If Not stream.Eof()
  318. c=stream.ReadByte()
  319. size:+1
  320. If Not stream.Eof()
  321. d=stream.ReadByte()
  322. size:+1
  323. If c=$fe And d=$ff
  324. format=ETextStreamFormat.UTF16BE
  325. Else If c=$ff And d=$fe
  326. format=ETextStreamFormat.UTF16LE
  327. Else If c=$ef And d=$bb
  328. If Not stream.Eof()
  329. e=stream.ReadByte()
  330. size:+1
  331. If e=$bf format=ETextStreamFormat.UTF8
  332. EndIf
  333. EndIf
  334. EndIf
  335. EndIf
  336. If Not format
  337. Local data:Byte[1024]
  338. data[0]=c;data[1]=d;data[2]=e
  339. While Not stream.Eof()
  340. If size=data.length-1 data=data[..size*2]
  341. size:+stream.Read( (Byte Ptr data)+size,data.length-size-1 )
  342. Wend
  343. stream.Close
  344. If checkForUTF8 And IsProbablyUTF8(data, size) Then
  345. Return String.FromUTF8String(data)
  346. Else
  347. Return String.FromBytes( data,size )
  348. End If
  349. EndIf
  350. Local TStream:TTextStream=TTextStream.Create( stream,format )
  351. Local str:String=TStream.ReadFile()
  352. TStream.Close
  353. stream.Close
  354. Return str
  355. End Function
  356. Rem
  357. bbdoc: Save text to a stream
  358. about:
  359. #SaveText saves the characters in @str to @url.
  360. If @str contains any characters with a character code greater than 255,
  361. then @str is saved in UTF16 format. Otherwise, @str is saved in LATIN1 format.
  362. A #TStreamWriteException is thrown if not all bytes could be written.
  363. End Rem
  364. Function SaveText:Int( str:String,url:Object, format:ETextStreamFormat = ETextStreamFormat.LATIN1, withBOM:Int = True )
  365. If format <> ETextStreamFormat.LATIN1 And format <> ETextStreamFormat.UTF8
  366. For Local i:Int=0 Until str.length
  367. If str[i]>255
  368. ?BigEndian
  369. format=ETextStreamFormat.UTF16BE
  370. ?LittleEndian
  371. format=ETextStreamFormat.UTF16LE
  372. ?
  373. Exit
  374. EndIf
  375. Next
  376. End If
  377. If format = ETextStreamFormat.LATIN1
  378. SaveString str,url
  379. Return True
  380. EndIf
  381. Local stream:TStream=WriteStream( url )
  382. If Not stream Throw New TStreamWriteException
  383. If withBOM Then
  384. Select format
  385. Case ETextStreamFormat.UTF8
  386. stream.WriteByte $ef
  387. stream.WriteByte $bb
  388. stream.WriteByte $bf
  389. Case ETextStreamFormat.UTF16BE
  390. stream.WriteByte $fe
  391. stream.WriteByte $ff
  392. Case ETextStreamFormat.UTF16LE
  393. stream.WriteByte $ff
  394. stream.WriteByte $fe
  395. End Select
  396. End If
  397. Local TStream:TTextStream=TTextStream.Create( stream,format )
  398. TStream.WriteString str
  399. TStream.Close
  400. stream.Close
  401. Return True
  402. End Function
  403. Private
  404. Function IsProbablyUTF8:Int(data:Byte Ptr, size:Int)
  405. Local count:Int
  406. Local buf:Byte[6]
  407. Local encodeBuf:Byte[6]
  408. For Local i:Int = 0 Until size
  409. Local c:Int = data[i]
  410. If c < $80 Or (c & $c0) <> $80 Then
  411. If count > 0 Then
  412. Local char:Int = Decode(buf, count)
  413. If char = -1 Then
  414. Return False
  415. End If
  416. Local encodedCount:Int = Encode(char, encodeBuf, count)
  417. If count <> encodedCount Then
  418. Return False
  419. End If
  420. For Local n:Int = 0 Until count
  421. If buf[n] <> encodeBuf[n] Then
  422. Return False
  423. End If
  424. Next
  425. End If
  426. count = 0
  427. If c >= $80 Then
  428. buf[count] = c
  429. count :+ 1
  430. End If
  431. Else
  432. If count = 6 Then
  433. Return False
  434. End If
  435. buf[count] = c
  436. count :+ 1
  437. End If
  438. Next
  439. If count Then
  440. Return False
  441. End If
  442. Return True
  443. End Function
  444. Function Decode:Int(buf:Byte Ptr, count:Int)
  445. If count <= 0 Then
  446. Return -1
  447. End If
  448. If count = 1 Then
  449. If buf[0] >= $80 Then
  450. Return -1
  451. Else
  452. Return buf[0]
  453. End If
  454. End If
  455. Local bits:Int = 0
  456. Local c:Int = buf[0]
  457. While c & $80 = $80
  458. bits :+ 1
  459. c :Shl 1
  460. Wend
  461. If bits <> count Then
  462. Return -1
  463. End If
  464. Local v:Int = buf[0] & ($ff Shr bits)
  465. For Local i:Int = 1 Until count
  466. If buf[i] & $c0 <> $80 Then
  467. Return -1
  468. End If
  469. v = (v Shl 6) | (buf[i] & $3f)
  470. Next
  471. If v >= $d800 And v <= $dfff Then
  472. Return -1
  473. End If
  474. If v = $fffe Or v = $ffff Then
  475. Return -1
  476. End If
  477. Return v
  478. End Function
  479. Function Encode:Int(char:Int, buf:Byte Ptr, count:Int)
  480. If char<128
  481. buf[0] = char
  482. Return 1
  483. Else If char<2048
  484. If count <> 2 Then
  485. Return -1
  486. End If
  487. buf[0] = char/64 | 192
  488. buf[1] = char Mod 64 | 128
  489. Return 2
  490. Else If char < $10000
  491. If count <> 3 Then
  492. Return -1
  493. End If
  494. buf[0] = char/4096 | 224
  495. buf[1] = char/64 Mod 64 | 128
  496. buf[2] = char Mod 64 | 128
  497. Return 3
  498. Else
  499. If count <> 4 Then
  500. Return -1
  501. End If
  502. buf[0] = (char Shr 18) | $f0
  503. buf[1] = ((char Shr 12) & $3f) | $80
  504. buf[2] = ((char Shr 6) & $3f) | $80
  505. buf[3] = (char & $3f) | $80
  506. Return 4
  507. End If
  508. Return -1
  509. End Function