utf8stream.bmx 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. ' Copyright (c) 2023 Bruce A Henderson
  2. '
  3. ' This software is provided 'as-is', without any express or implied
  4. ' warranty. In no event will the authors be held liable for any damages
  5. ' arising from the use of this software.
  6. '
  7. ' Permission is granted to anyone to use this software for any purpose,
  8. ' including commercial applications, and to alter it and redistribute it
  9. ' freely, subject to the following restrictions:
  10. '
  11. ' 1. The origin of this software must not be misrepresented; you must not
  12. ' claim that you wrote the original software. If you use this software
  13. ' in a product, an acknowledgment in the product documentation would be
  14. ' appreciated but is not required.
  15. '
  16. ' 2. Altered source versions must be plainly marked as such, and must not be
  17. ' misrepresented as being the original software.
  18. '
  19. ' 3. This notice may not be removed or altered from any source
  20. ' distribution.
  21. '
  22. SuperStrict
  23. Rem
  24. bbdoc: Converts a stream of one encoding to a stream of UTF8 bytes.
  25. End Rem
  26. Module BRL.UTF8Stream
  27. ModuleInfo "Version: 1.00"
  28. ModuleInfo "Author: Bruce A Henderson"
  29. ModuleInfo "License: zlib"
  30. ModuleInfo "Copyright: 2023 Bruce A Henderson"
  31. ModuleInfo "History: 1.00"
  32. ModuleInfo "History: Initial Release"
  33. Import BRL.Stream
  34. Rem
  35. bbdoc: Supported encodings.
  36. about: Only a few of the most common encodings are provided in BRL.UTF8Stream.
  37. More encodings are available in the Text.Encoding module.
  38. End Rem
  ' Identifies the encoding of the source stream handed to TEncodingToUTF8Stream.
  ' In the part of this file shown here, loaders are registered for UTF16,
  ' WINDOWS_1252 and ISO_8859_1 only; asking TEncodingToUTF8Stream for a value
  ' with no registered loader raises TEncodingNotAvailableException.
  Enum EStreamEncoding
      UTF16
      ' Each WINDOWS_xxxx member below is followed by a CPxxxx alias with the same value.
      WINDOWS_1252
      CP1252=WINDOWS_1252
      ' LATIN1 is an alias for ISO_8859_1.
      ISO_8859_1
      LATIN1=ISO_8859_1
      WINDOWS_1250
      CP1250=WINDOWS_1250
      WINDOWS_1251
      CP1251=WINDOWS_1251
      WINDOWS_1253
      CP1253=WINDOWS_1253
      WINDOWS_1254
      CP1254=WINDOWS_1254
      WINDOWS_1255
      CP1255=WINDOWS_1255
      WINDOWS_1256
      CP1256=WINDOWS_1256
      WINDOWS_1257
      CP1257=WINDOWS_1257
      ' Remaining ISO_8859 members; these have no aliases.
      ISO_8859_2
      ISO_8859_5
      ISO_8859_6
      ISO_8859_7
      ISO_8859_8
      ISO_8859_9
      ISO_8859_15
  End Enum
  67. Rem
  68. bbdoc: This class wraps a stream and converts its contents to UTF8.
  69. End Rem
  Type TEncodingToUTF8Stream Extends TStreamWrapper

      ' Converter that reads characters from the wrapped stream and produces their UTF-8 bytes.
      Field encodingStrategy:IEncodingStrategy
      ' UTF-8 bytes of the most recently decoded character that have not been
      ' handed to a caller yet (count = 0 when nothing is pending).
      Field remainingUTF8Char:SUTF8Char

      Rem
      bbdoc: Wraps @stream, whose content is in @encoding, so that reading yields UTF-8 bytes.
      about: The registered loaders are searched for one whose Encoding() equals @encoding.
      Throws #TEncodingNotAvailableException when none is found.
      End Rem
      Method New(stream:TStream, encoding:EStreamEncoding)
          _stream = stream
          Local loader:TEncodingStrategyLoader = utf8stream_strategies
          While loader
              If loader.Encoding() = encoding Then
                  encodingStrategy = loader.Load(stream)
                  Return
              End If
              loader = loader._succ
          Wend
          Throw New TEncodingNotAvailableException(encoding)
      End Method

      Rem
      bbdoc: Reads up to @count UTF-8 bytes into @buf.
      returns: The number of bytes stored in @buf.
      about: A character whose UTF-8 form does not fit completely into the space left
      in @buf is split: the leading bytes are delivered now and the rest is kept for the
      next call. This guarantees progress even when @count is smaller than one encoded
      character, so a return value of 0 is only produced when @count is not positive or
      the encoding strategy yields no further bytes.
      End Rem
      Method Read:Long(buf:Byte Ptr, count:Long) Override
          Local bytesRead:Long = 0

          ' Bytes left over from a character that was split by an earlier call go first.
          Local delivered:Int = DeliverPending(buf, count)
          buf :+ delivered
          bytesRead :+ delivered

          While bytesRead < count
              encodingStrategy.ReadEncodedChar(remainingUTF8Char)
              If remainingUTF8Char.count = 0 Then Exit ' the strategy produced nothing more

              ' Copies the whole character when it fits, otherwise only its leading bytes.
              delivered = DeliverPending(buf, count - bytesRead)
              buf :+ delivered
              bytesRead :+ delivered
          Wend

          Return bytesRead
      End Method

      ' Copies at most maxBytes of the pending character into buf, moves any bytes that
      ' did not fit to the front of remainingUTF8Char, and returns how many were copied.
      Method DeliverPending:Int(buf:Byte Ptr, maxBytes:Long)
          Local n:Int = remainingUTF8Char.count
          If n > maxBytes Then n = Int(maxBytes)
          If n <= 0 Then Return 0

          For Local i:Int = 0 Until n
              buf[i] = remainingUTF8Char.bytes[i]
          Next

          Local leftOver:Int = remainingUTF8Char.count - n
          For Local j:Int = 0 Until leftOver
              remainingUTF8Char.bytes[j] = remainingUTF8Char.bytes[j + n]
          Next
          remainingUTF8Char.count = leftOver

          Return n
      End Method

  End Type
  ' Value type carrying the UTF-8 bytes of a single character between an
  ' encoding strategy and the stream that consumes them.
  Struct SUTF8Char
      ' Encoded bytes; only the first `count` entries are meaningful.
      Field StaticArray bytes:Byte[4]
      ' Number of valid entries in `bytes`. Strategies in this file set it to 0 when
      ' they have nothing to deliver, and TEncodingToUTF8Stream.Read stops on 0.
      Field count:Int
  End Struct
  ' Contract for a converter that produces one character at a time as UTF-8.
  Interface IEncodingStrategy
      ' Fills utf8Char (passed by reference) with the UTF-8 bytes of the next character.
      ' The implementations in this file set utf8Char.count to 0 when the underlying
      ' stream returns no data.
      Method ReadEncodedChar(utf8Char:SUTF8Char Var)
  End Interface
  ' Contract for a factory that creates the IEncodingStrategy for one encoding.
  Interface IEncodingStrategyLoader
      ' The encoding this loader provides a strategy for.
      Method Encoding:EStreamEncoding()
      ' Creates a strategy that takes its input from stream.
      Method Load:IEncodingStrategy(stream:TStream)
  End Interface
  Private
  ' Head of the singly linked list of loaders; TEncodingToUTF8Stream walks it via _succ.
  Global utf8stream_strategies:TEncodingStrategyLoader
  Public
  ' Abstract base for loaders. Constructing an instance links it into the loader list,
  ' so the most recently created loader is found first.
  Type TEncodingStrategyLoader Implements IEncodingStrategyLoader Abstract
      ' The loader that was at the head of the list when this one was created (Null for the first).
      Field _succ:TEncodingStrategyLoader
      ' Pushes this loader onto the front of the list.
      Method New()
          _succ=utf8stream_strategies
          utf8stream_strategies=Self
      End Method
  End Type
  ' Loader that reports EStreamEncoding.UTF16 and creates TUTF16EncodingStrategy instances.
  Type TEncodingStrategyLoaderUTF16 Extends TEncodingStrategyLoader
      ' Encoding handled by this loader.
      Method Encoding:EStreamEncoding()
          Return EStreamEncoding.UTF16
      End Method
      ' Creates the UTF-16 strategy reading from stream.
      Method Load:IEncodingStrategy(stream:TStream)
          Return New TUTF16EncodingStrategy(stream)
      End Method
  End Type
  ' Creating one instance is what registers the loader (see TEncodingStrategyLoader.New).
  New TEncodingStrategyLoaderUTF16
  ' Converts a stream of 16-bit code units to UTF-8, one character per call.
  ' Surrogate pairs are combined into a single code point and emitted as a 4-byte
  ' sequence; an unpaired surrogate is replaced by U+FFFD so the output stays valid UTF-8.
  ' NOTE(review): code units are used exactly as stream.ReadShort delivers them; no byte
  ' order mark is interpreted here - confirm the expected byte order with callers.
  Type TUTF16EncodingStrategy Implements IEncodingStrategy

      ' Stream the code units are read from.
      Field stream:TStream
      ' Code unit that was read while looking for a low surrogate but was not one.
      ' It is handed out by the next NextCodeUnit call; -1 means nothing is held back.
      Field pendingUnit:Int = -1

      Method New(sourceStream:TStream)
          stream = sourceStream
          pendingUnit = -1
      End Method

      ' Stores the UTF-8 form of the next character in utf8Char.
      ' utf8Char.count is set to 0 when no further code unit can be read.
      Method ReadEncodedChar(utf8Char:SUTF8Char Var)
          Local unit:Int = NextCodeUnit()
          If unit < 0 Then
              utf8Char.count = 0
              Return
          End If

          Local codePoint:Int = unit
          If unit >= $D800 And unit <= $DBFF Then
              ' High surrogate: it is only meaningful together with a following low surrogate.
              Local low:Int = NextCodeUnit()
              If low >= $DC00 And low <= $DFFF Then
                  codePoint = $10000 + ((unit - $D800) Shl 10) + (low - $DC00)
              Else
                  ' Not a pair. Keep the unit just read (if any) for the next call.
                  pendingUnit = low
                  codePoint = $FFFD
              End If
          Else If unit >= $DC00 And unit <= $DFFF Then
              ' Low surrogate without a preceding high surrogate.
              codePoint = $FFFD
          End If

          If codePoint < $80 Then
              utf8Char.bytes[0] = Byte(codePoint)
              utf8Char.count = 1
          Else If codePoint < $800 Then
              utf8Char.bytes[0] = Byte($C0 | (codePoint Shr 6))
              utf8Char.bytes[1] = Byte($80 | (codePoint & $3F))
              utf8Char.count = 2
          Else If codePoint < $10000 Then
              utf8Char.bytes[0] = Byte($E0 | (codePoint Shr 12))
              utf8Char.bytes[1] = Byte($80 | ((codePoint Shr 6) & $3F))
              utf8Char.bytes[2] = Byte($80 | (codePoint & $3F))
              utf8Char.count = 3
          Else
              utf8Char.bytes[0] = Byte($F0 | (codePoint Shr 18))
              utf8Char.bytes[1] = Byte($80 | ((codePoint Shr 12) & $3F))
              utf8Char.bytes[2] = Byte($80 | ((codePoint Shr 6) & $3F))
              utf8Char.bytes[3] = Byte($80 | (codePoint & $3F))
              utf8Char.count = 4
          End If
      End Method

      ' Returns the next code unit (0..$FFFF), or -1 when the stream delivers nothing.
      ' A unit held back by ReadEncodedChar is returned before the stream is read again.
      Method NextCodeUnit:Int()
          If pendingUnit >= 0 Then
              Local held:Int = pendingUnit
              pendingUnit = -1
              Return held
          End If
          Local StaticArray buf:Short[1]
          If stream.ReadShort(buf) = 0 Then Return -1
          Return buf[0]
      End Method

  End Type
  177. Rem
  178. bbdoc: Thrown when an encoding is not available.
  179. End Rem
  Type TEncodingNotAvailableException Extends TBlitzException

      ' The encoding that was requested but has no registered loader.
      Field encoding:EStreamEncoding

      ' Remembers which encoding could not be provided.
      Method New(encoding:EStreamEncoding)
          Self.encoding = encoding
      End Method

      ' Human-readable description naming the missing encoding.
      Method ToString:String()
          Local encodingName:String = encoding.ToString()
          Return "Support for encoding " + encodingName + " has not been imported."
      End Method

  End Type
  189. Rem
  190. bbdoc: Base class for encoding strategies that use a single byte to represent a character.
  191. End Rem
  Type TBaseSingleByteEncodingStrategy Implements IEncodingStrategy Abstract

      ' Stream the encoded bytes are read from.
      Field stream:TStream
      ' Code points for the byte values 128..255; entry i belongs to byte 128 + i.
      Field StaticArray encodingTable:Short[128]

      Method New(sourceStream:TStream)
          stream = sourceStream
      End Method

      ' Implemented by concrete encodings: writes 128 code points to table.
      Method LoadTable(table:Short Ptr) Abstract

      ' Fills encodingTable through the concrete type's LoadTable.
      Method LoadMapping()
          LoadTable(encodingTable)
      End Method

      ' Reads one byte and stores its UTF-8 form in utf8Char.
      ' Bytes below 128 are passed through unchanged; all others are looked up in
      ' encodingTable. utf8Char.count is set to 0 when the stream returns no data.
      Method ReadEncodedChar(utf8Char:SUTF8Char Var) Override
          Local StaticArray raw:Byte[1]
          If stream.Read(raw, 1) = 0 Then
              utf8Char.count = 0
              Return
          End If

          Local value:Int = raw[0]
          If value >= 128 Then
              EncodeSingleByteToUTF8(utf8Char, encodingTable[value - 128])
              Return
          End If

          utf8Char.bytes[0] = value
          utf8Char.count = 1
      End Method

      ' Writes the UTF-8 encoding of code point c (one to three bytes) into utf8Char.
      ' NOTE(review): c = 0 yields count = 0, which TEncodingToUTF8Stream.Read treats
      ' as "no more input" - confirm that no mapping table relies on 0 entries.
      Method EncodeSingleByteToUTF8(utf8Char:SUTF8Char Var, c:Short)
          If c = 0 Then
              utf8Char.count = 0
              Return
          End If

          If c < 128 Then
              utf8Char.bytes[0] = c
              utf8Char.count = 1
              Return
          End If

          If c < 2048 Then
              utf8Char.bytes[0] = 192 | ((c Shr 6) & 31)
              utf8Char.bytes[1] = 128 | (c & 63)
              utf8Char.count = 2
              Return
          End If

          utf8Char.bytes[0] = 224 | ((c Shr 12) & 15)
          utf8Char.bytes[1] = 128 | ((c Shr 6) & 63)
          utf8Char.bytes[2] = 128 | (c & 63)
          utf8Char.count = 3
      End Method

  End Type
  ' Loader that reports EStreamEncoding.WINDOWS_1252 and creates TWindows1252EncodingStrategy instances.
  Type TEncodingStrategyLoaderWindows1252 Extends TEncodingStrategyLoader
      ' Encoding handled by this loader.
      Method Encoding:EStreamEncoding()
          Return EStreamEncoding.WINDOWS_1252
      End Method
      ' Creates the Windows-1252 strategy reading from stream.
      Method Load:IEncodingStrategy(stream:TStream)
          Return New TWindows1252EncodingStrategy(stream)
      End Method
  End Type
  ' Creating one instance is what registers the loader (see TEncodingStrategyLoader.New).
  New TEncodingStrategyLoaderWindows1252
  243. Rem
  244. bbdoc: An encoding strategy for Windows-1252.
  245. End Rem
  Type TWindows1252EncodingStrategy Extends TBaseSingleByteEncodingStrategy

      ' Binds the strategy to its source stream and fills the mapping table.
      Method New(sourceStream:TStream)
          stream = sourceStream
          LoadMapping()
      End Method

      ' Writes the code points for bytes $80..$FF to table[0..127].
      Method LoadTable(table:Short Ptr)
          ' Only bytes $80-$9F map to something other than their own value, so just
          ' that range is kept as a literal. $FFFD marks bytes without an assignment.
          Global highControlBlock:Short[]
          If Not highControlBlock Then
              highControlBlock = [..
                  $20AC:Short, $FFFD:Short, $201A:Short, $0192:Short, $201E:Short, $2026:Short, $2020:Short, $2021:Short, $02C6:Short, $2030:Short, $0160:Short, $2039:Short, $0152:Short, $FFFD:Short, $017D:Short, $FFFD:Short,..
                  $FFFD:Short, $2018:Short, $2019:Short, $201C:Short, $201D:Short, $2022:Short, $2013:Short, $2014:Short, $02DC:Short, $2122:Short, $0161:Short, $203A:Short, $0153:Short, $FFFD:Short, $017E:Short, $0178:Short]
          End If

          ' Bytes $80-$9F come from the literal block.
          For Local slot:Int = 0 Until 32
              table[slot] = highControlBlock[slot]
          Next

          ' Bytes $A0-$FF map to the code point with the same value.
          For Local code:Int = $A0 To $FF
              table[code - 128] = code
          Next
      End Method

  End Type
  ' Loader that reports EStreamEncoding.ISO_8859_1 and creates TISO_8859_1_EncodingStrategy instances.
  Type TEncodingStrategyLoaderISO_8859_1 Extends TEncodingStrategyLoader
      ' Encoding handled by this loader.
      Method Encoding:EStreamEncoding()
          Return EStreamEncoding.ISO_8859_1
      End Method
      ' Creates the ISO-8859-1 strategy reading from stream.
      Method Load:IEncodingStrategy(stream:TStream)
          Return New TISO_8859_1_EncodingStrategy(stream)
      End Method
  End Type
  ' Creating one instance is what registers the loader (see TEncodingStrategyLoader.New).
  New TEncodingStrategyLoaderISO_8859_1
  278. Rem
  279. bbdoc: An encoding strategy for ISO-8859-1.
  280. End Rem
  Type TISO_8859_1_EncodingStrategy Extends TBaseSingleByteEncodingStrategy

      ' Binds the strategy to its source stream and fills the mapping table.
      Method New(sourceStream:TStream)
          stream = sourceStream
          LoadMapping()
      End Method

      ' Every byte 128..255 maps to the code point with the same numeric value.
      Method LoadTable(table:Short Ptr)
          For Local offset:Int = 0 Until 128
              table[offset] = 128 + offset
          Next
      End Method

  End Type