csvreadwrite.pp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. {
  2. CSV Parser, Builder classes.
  3. Version 0.5 2014-10-25
  4. Copyright (C) 2010-2014 Vladimir Zhirov <[email protected]>
  5. Contributors:
  6. Luiz Americo Pereira Camara
  7. Mattias Gaertner
  8. Reinier Olislagers
  9. This library is free software; you can redistribute it and/or modify it
  10. under the terms of the GNU Library General Public License as published by
  11. the Free Software Foundation; either version 2 of the License, or (at your
  12. option) any later version with the following modification:
  13. As a special exception, the copyright holders of this library give you
  14. permission to link this library with independent modules to produce an
  15. executable, regardless of the license terms of these independent modules,and
  16. to copy and distribute the resulting executable under terms of your choice,
  17. provided that you also meet, for each linked independent module, the terms
  18. and conditions of the license of that module. An independent module is a
  19. module which is not derived from or based on this library. If you modify
  20. this library, you may extend this exception to your version of the library,
  21. but you are not obligated to do so. If you do not wish to do so, delete this
  22. exception statement from your version.
  23. This program is distributed in the hope that it will be useful, but WITHOUT
  24. ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  25. FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
  26. for more details.
  27. You should have received a copy of the GNU Library General Public License
  28. along with this library; if not, write to the Free Software Foundation,
  29. Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  30. }
  31. unit csvreadwrite;
  32. {$mode objfpc}{$H+}
  33. interface
  34. uses
  35. Classes, SysUtils, strutils;
  36. Type
  37. TCSVChar = Char;
  38. { TCSVHandler }
  39. TCSVHandler = class(TPersistent)
  40. private
  41. procedure SetDelimiter(const AValue: TCSVChar);
  42. procedure SetQuoteChar(const AValue: TCSVChar);
  43. procedure UpdateCachedChars;
  44. protected
  45. // special chars
  46. FDelimiter: TCSVChar;
  47. FQuoteChar: TCSVChar;
  48. FLineEnding: String;
  49. // cached values to speed up special chars operations
  50. FSpecialChars: TSysCharSet;
  51. FDoubleQuote: String;
  52. // parser settings
  53. FIgnoreOuterWhitespace: Boolean;
  54. // builder settings
  55. FQuoteOuterWhitespace: Boolean;
  56. // document settings
  57. FEqualColCountPerRow: Boolean;
  58. public
  59. constructor Create; virtual;
  60. procedure Assign(ASource: TPersistent); override;
  61. procedure AssignCSVProperties(ASource: TCSVHandler);
  62. // Delimiter that separates the field, e.g. comma, semicolon, tab
  63. property Delimiter: TCSVChar read FDelimiter write SetDelimiter;
  64. // Character used to quote "problematic" data
  65. // (e.g. with delimiters or spaces in them)
  66. // A common quotechar is "
  67. property QuoteChar: TCSVChar read FQuoteChar write SetQuoteChar;
  68. // String at the end of the line of data (e.g. CRLF)
  69. property LineEnding: String read FLineEnding write FLineEnding;
  70. // Ignore whitespace between delimiters and field data
  71. property IgnoreOuterWhitespace: Boolean read FIgnoreOuterWhitespace write FIgnoreOuterWhitespace;
  72. // Use quotes when outer whitespace is found
  73. property QuoteOuterWhitespace: Boolean read FQuoteOuterWhitespace write FQuoteOuterWhitespace;
  74. // When reading and writing: make sure every line has the same column count, create empty cells in the end of row if required
  75. property EqualColCountPerRow: Boolean read FEqualColCountPerRow write FEqualColCountPerRow;
  76. end;
  77. // Sequential input from CSV stream
  78. { TCSVParser }
  79. TCSVByteOrderMark = (bomNone, bomUTF8, bomUTF16LE, bomUTF16BE);
  80. TCSVParser = class(TCSVHandler)
  81. private
  82. FFreeStream: Boolean;
  83. // fields
  84. FSourceStream: TStream;
  85. FStrStreamWrapper: TStringStream;
  86. FBOM: TCSVByteOrderMark;
  87. FDetectBOM: Boolean;
  88. // parser state
  89. EndOfFile: Boolean;
  90. EndOfLine: Boolean;
  91. FCurrentChar: TCSVChar;
  92. FCurrentRow: Integer;
  93. FCurrentCol: Integer;
  94. FMaxColCount: Integer;
  95. // output buffers
  96. FCellBuffer: String;
  97. FWhitespaceBuffer: String;
  98. procedure ClearOutput;
  99. // basic parsing
  100. procedure SkipEndOfLine;
  101. procedure SkipDelimiter;
  102. procedure SkipWhitespace;
  103. procedure NextChar;
  104. // complex parsing
  105. procedure ParseCell;
  106. procedure ParseQuotedValue;
  107. // simple parsing
  108. procedure ParseValue;
  109. public
  110. constructor Create;
  111. destructor Destroy; override;
  112. // Source data stream
  113. procedure SetSource(AStream: TStream); overload;
  114. // Source data string.
  115. procedure SetSource(const AString: String); overload;
  116. // Rewind to beginning of data
  117. procedure ResetParser;
  118. // Read next cell data; return false if end of file reached
  119. function ParseNextCell: Boolean;
  120. // Current row (0 based)
  121. property CurrentRow: Integer read FCurrentRow;
  122. // Current column (0 based); -1 if invalid/before beginning of file
  123. property CurrentCol: Integer read FCurrentCol;
  124. // Data in current cell
  125. property CurrentCellText: String read FCellBuffer;
  126. // The maximum number of columns found in the stream:
  127. property MaxColCount: Integer read FMaxColCount;
  128. // Does the parser own the stream ? If true, a previous stream is freed when set or when parser is destroyed.
  129. Property FreeStream : Boolean Read FFreeStream Write FFreeStream;
  130. // Return BOM found in file
  131. property BOM: TCSVByteOrderMark read FBOM;
  132. // Detect whether a BOM marker is present. If set to True, then BOM can be used to see what BOM marker there was.
  133. property DetectBOM: Boolean read FDetectBOM write FDetectBOM default false;
  134. end;
  135. // Sequential output to CSV stream
  136. TCSVBuilder = class(TCSVHandler)
  137. private
  138. FOutputStream: TStream;
  139. FDefaultOutput: TMemoryStream;
  140. FNeedLeadingDelimiter: Boolean;
  141. function GetDefaultOutputAsString: String;
  142. protected
  143. procedure AppendStringToStream(const AString: String; AStream: TStream);
  144. function QuoteCSVString(const AValue: String): String;
  145. public
  146. constructor Create;
  147. destructor Destroy; override;
  148. // Set output/destination stream.
  149. // If not called, output is sent to DefaultOutput
  150. procedure SetOutput(AStream: TStream);
  151. // If using default stream, reset output to beginning.
  152. // If using user-defined stream, user should reposition stream himself
  153. procedure ResetBuilder;
  154. // Add a cell to the output with data AValue
  155. procedure AppendCell(const AValue: String);
  156. // Write end of row to the output, starting a new row
  157. procedure AppendRow;
  158. // Default output as memorystream (if output not set using SetOutput)
  159. property DefaultOutput: TMemoryStream read FDefaultOutput;
  160. // Default output in string format (if output not set using SetOutput)
  161. property DefaultOutputAsString: String read GetDefaultOutputAsString;
  162. end;
  163. function ChangeLineEndings(const AString, ALineEnding: String): String;
  164. implementation
  165. const
  166. CsvCharSize = SizeOf(TCSVChar);
  167. CR = #13;
  168. LF = #10;
  169. HTAB = #9;
  170. SPACE = #32;
  171. WhitespaceChars = [HTAB, SPACE];
  172. LineEndingChars = [CR, LF];
  173. // The following implementation of ChangeLineEndings function originates from
  174. // Lazarus CodeTools library by Mattias Gaertner. It was explicitly allowed
  175. // by Mattias to relicense it under modified LGPL and include into CsvDocument.
  176. function ChangeLineEndings(const AString, ALineEnding: String): String;
  177. var
  178. I: Integer;
  179. Src: PChar;
  180. Dest: PChar;
  181. DestLength: Integer;
  182. EndingLength: Integer;
  183. EndPos: PChar;
  184. begin
  185. if AString = '' then
  186. Exit(AString);
  187. EndingLength := Length(ALineEnding);
  188. DestLength := Length(AString);
  189. Src := PChar(AString);
  190. EndPos := Src + DestLength;
  191. while Src < EndPos do
  192. begin
  193. if (Src^ = CR) then
  194. begin
  195. Inc(Src);
  196. if (Src^ = LF) then
  197. begin
  198. Inc(Src);
  199. Inc(DestLength, EndingLength - 2);
  200. end else
  201. Inc(DestLength, EndingLength - 1);
  202. end else
  203. begin
  204. if (Src^ = LF) then
  205. Inc(DestLength, EndingLength - 1);
  206. Inc(Src);
  207. end;
  208. end;
  209. SetLength(Result, DestLength);
  210. Src := PChar(AString);
  211. Dest := PChar(Result);
  212. EndPos := Dest + DestLength;
  213. while (Dest < EndPos) do
  214. begin
  215. if Src^ in LineEndingChars then
  216. begin
  217. for I := 1 to EndingLength do
  218. begin
  219. Dest^ := ALineEnding[I];
  220. Inc(Dest);
  221. end;
  222. if (Src^ = CR) and (Src[1] = LF) then
  223. Inc(Src, 2)
  224. else
  225. Inc(Src);
  226. end else
  227. begin
  228. Dest^ := Src^;
  229. Inc(Src);
  230. Inc(Dest);
  231. end;
  232. end;
  233. end;
  234. { TCSVHandler }
  235. procedure TCSVHandler.SetDelimiter(const AValue: TCSVChar);
  236. begin
  237. if FDelimiter <> AValue then
  238. begin
  239. FDelimiter := AValue;
  240. UpdateCachedChars;
  241. end;
  242. end;
  243. procedure TCSVHandler.SetQuoteChar(const AValue: TCSVChar);
  244. begin
  245. if FQuoteChar <> AValue then
  246. begin
  247. FQuoteChar := AValue;
  248. UpdateCachedChars;
  249. end;
  250. end;
  251. procedure TCSVHandler.UpdateCachedChars;
  252. begin
  253. FDoubleQuote := FQuoteChar + FQuoteChar;
  254. FSpecialChars := [CR, LF, FDelimiter, FQuoteChar];
  255. end;
  256. constructor TCSVHandler.Create;
  257. begin
  258. inherited Create;
  259. FDelimiter := ',';
  260. FQuoteChar := '"';
  261. FLineEnding := sLineBreak;
  262. FIgnoreOuterWhitespace := False;
  263. FQuoteOuterWhitespace := True;
  264. FEqualColCountPerRow := True;
  265. UpdateCachedChars;
  266. end;
  267. procedure TCSVHandler.Assign(ASource: TPersistent);
  268. begin
  269. if (ASource is TCSVHandler) then
  270. AssignCSVProperties(ASource as TCSVHandler)
  271. else
  272. inherited Assign(ASource);
  273. end;
  274. procedure TCSVHandler.AssignCSVProperties(ASource: TCSVHandler);
  275. begin
  276. FDelimiter := ASource.FDelimiter;
  277. FQuoteChar := ASource.FQuoteChar;
  278. FLineEnding := ASource.FLineEnding;
  279. FIgnoreOuterWhitespace := ASource.FIgnoreOuterWhitespace;
  280. FQuoteOuterWhitespace := ASource.FQuoteOuterWhitespace;
  281. FEqualColCountPerRow := ASource.FEqualColCountPerRow;
  282. UpdateCachedChars;
  283. end;
  284. { TCSVParser }
  285. procedure TCSVParser.ClearOutput;
  286. begin
  287. FCellBuffer := '';
  288. FWhitespaceBuffer := '';
  289. FCurrentRow := 0;
  290. FCurrentCol := -1;
  291. FMaxColCount := 0;
  292. end;
  293. procedure TCSVParser.SkipEndOfLine;
  294. begin
  295. // treat LF+CR as two linebreaks, not one
  296. if (FCurrentChar = CR) then
  297. NextChar;
  298. if (FCurrentChar = LF) then
  299. NextChar;
  300. end;
  301. procedure TCSVParser.SkipDelimiter;
  302. begin
  303. if FCurrentChar = FDelimiter then
  304. NextChar;
  305. end;
  306. procedure TCSVParser.SkipWhitespace;
  307. begin
  308. while FCurrentChar = SPACE do
  309. NextChar;
  310. end;
  311. procedure TCSVParser.NextChar;
  312. begin
  313. if FSourceStream.Read(FCurrentChar, CsvCharSize) < CsvCharSize then
  314. begin
  315. FCurrentChar := #0;
  316. EndOfFile := True;
  317. end;
  318. EndOfLine := FCurrentChar in LineEndingChars;
  319. end;
  320. procedure TCSVParser.ParseCell;
  321. begin
  322. FCellBuffer := '';
  323. if FIgnoreOuterWhitespace then
  324. SkipWhitespace;
  325. if FCurrentChar = FQuoteChar then
  326. ParseQuotedValue
  327. else
  328. ParseValue;
  329. end;
  330. procedure TCSVParser.ParseQuotedValue;
  331. var
  332. QuotationEnd: Boolean;
  333. begin
  334. NextChar; // skip opening quotation char
  335. repeat
  336. // read value up to next quotation char
  337. while not ((FCurrentChar = FQuoteChar) or EndOfFile) do
  338. begin
  339. if EndOfLine then
  340. begin
  341. AppendStr(FCellBuffer, FLineEnding);
  342. SkipEndOfLine;
  343. end else
  344. begin
  345. AppendStr(FCellBuffer, FCurrentChar);
  346. NextChar;
  347. end;
  348. end;
  349. // skip quotation char (closing or escaping)
  350. if not EndOfFile then
  351. NextChar;
  352. // check if it was escaping
  353. if FCurrentChar = FQuoteChar then
  354. begin
  355. AppendStr(FCellBuffer, FCurrentChar);
  356. QuotationEnd := False;
  357. NextChar;
  358. end else
  359. QuotationEnd := True;
  360. until QuotationEnd;
  361. // read the rest of the value until separator or new line
  362. ParseValue;
  363. end;
  364. procedure TCSVParser.ParseValue;
  365. begin
  366. while not ((FCurrentChar = FDelimiter) or EndOfLine or EndOfFile) do
  367. begin
  368. AppendStr(FWhitespaceBuffer, FCurrentChar);
  369. NextChar;
  370. end;
  371. // merge whitespace buffer
  372. if FIgnoreOuterWhitespace then
  373. RemoveTrailingChars(FWhitespaceBuffer, WhitespaceChars);
  374. AppendStr(FCellBuffer, FWhitespaceBuffer);
  375. FWhitespaceBuffer := '';
  376. end;
  377. constructor TCSVParser.Create;
  378. begin
  379. inherited Create;
  380. ClearOutput;
  381. FStrStreamWrapper := nil;
  382. EndOfFile := True;
  383. end;
  384. destructor TCSVParser.Destroy;
  385. begin
  386. if FFreeStream and (FSourceStream<>FStrStreamWrapper) then
  387. FreeAndNil(FSourceStream);
  388. FreeAndNil(FStrStreamWrapper);
  389. inherited Destroy;
  390. end;
  391. procedure TCSVParser.SetSource(AStream: TStream);
  392. begin
  393. If FSourceStream=AStream then exit;
  394. if FFreeStream and (FSourceStream<>FStrStreamWrapper) then
  395. FreeAndNil(FSourceStream);
  396. FSourceStream := AStream;
  397. ResetParser;
  398. end;
  399. procedure TCSVParser.SetSource(const AString: String); overload;
  400. begin
  401. FreeAndNil(FStrStreamWrapper);
  402. FStrStreamWrapper := TStringStream.Create(AString);
  403. SetSource(FStrStreamWrapper);
  404. end;
  405. procedure TCSVParser.ResetParser;
  406. var
  407. b: packed array[0..2] of byte;
  408. n: Integer;
  409. begin
  410. ClearOutput;
  411. FSourceStream.Seek(0, soFromBeginning);
  412. if FDetectBOM then
  413. begin
  414. FSourceStream.ReadBuffer(b[0], 3);
  415. if (b[0] = $EF) and (b[1] = $BB) and (b[2] = $BF) then begin
  416. FBOM := bomUTF8;
  417. n := 3;
  418. end else
  419. if (b[0] = $FE) and (b[1] = $FF) then begin
  420. FBOM := bomUTF16BE;
  421. n := 2;
  422. end else
  423. if (b[0] = $FF) and (b[1] = $FE) then begin
  424. FBOM := bomUTF16LE;
  425. n := 2;
  426. end else begin
  427. FBOM := bomNone;
  428. n := 0;
  429. end;
  430. FSourceStream.Seek(n, soFromBeginning);
  431. end;
  432. EndOfFile := False;
  433. NextChar;
  434. end;
  435. // Parses next cell; returns True if there are more cells in the input stream.
  436. function TCSVParser.ParseNextCell: Boolean;
  437. var
  438. LineColCount: Integer;
  439. begin
  440. if EndOfLine or EndOfFile then
  441. begin
  442. // Having read the previous line, adjust column count if necessary:
  443. LineColCount := FCurrentCol + 1;
  444. if LineColCount > FMaxColCount then
  445. FMaxColCount := LineColCount;
  446. end;
  447. if EndOfFile then
  448. Exit(False);
  449. // Handle line ending
  450. if EndOfLine then
  451. begin
  452. SkipEndOfLine;
  453. if EndOfFile then
  454. Exit(False);
  455. FCurrentCol := 0;
  456. Inc(FCurrentRow);
  457. end else
  458. Inc(FCurrentCol);
  459. // Skipping a delimiter should be immediately followed by parsing a cell
  460. // without checking for line break first, otherwise we miss last empty cell.
  461. // But 0th cell does not start with delimiter unlike other cells, so
  462. // the following check is required not to miss the first empty cell:
  463. if FCurrentCol > 0 then
  464. SkipDelimiter;
  465. ParseCell;
  466. Result := True;
  467. end;
  468. { TCSVBuilder }
  469. function TCSVBuilder.GetDefaultOutputAsString: String;
  470. var
  471. StreamSize: Integer;
  472. begin
  473. Result := '';
  474. StreamSize := FDefaultOutput.Size;
  475. if StreamSize > 0 then
  476. begin
  477. SetLength(Result, StreamSize);
  478. FDefaultOutput.ReadBuffer(Result[1], StreamSize);
  479. end;
  480. end;
  481. procedure TCSVBuilder.AppendStringToStream(const AString: String; AStream: TStream);
  482. var
  483. StrLen: Integer;
  484. begin
  485. StrLen := Length(AString);
  486. if StrLen > 0 then
  487. AStream.WriteBuffer(AString[1], StrLen);
  488. end;
  489. function TCSVBuilder.QuoteCSVString(const AValue: String): String;
  490. var
  491. I: Integer;
  492. ValueLen: Integer;
  493. NeedQuotation: Boolean;
  494. begin
  495. ValueLen := Length(AValue);
  496. NeedQuotation := (AValue <> '') and FQuoteOuterWhitespace
  497. and ((AValue[1] in WhitespaceChars) or (AValue[ValueLen] in WhitespaceChars));
  498. if not NeedQuotation then
  499. for I := 1 to ValueLen do
  500. begin
  501. if AValue[I] in FSpecialChars then
  502. begin
  503. NeedQuotation := True;
  504. Break;
  505. end;
  506. end;
  507. if NeedQuotation then
  508. begin
  509. // double existing quotes
  510. Result := FDoubleQuote;
  511. Insert(StringReplace(AValue, FQuoteChar, FDoubleQuote, [rfReplaceAll]),
  512. Result, 2);
  513. end else
  514. Result := AValue;
  515. end;
  516. constructor TCSVBuilder.Create;
  517. begin
  518. inherited Create;
  519. FDefaultOutput := TMemoryStream.Create;
  520. FOutputStream := FDefaultOutput;
  521. end;
  522. destructor TCSVBuilder.Destroy;
  523. begin
  524. FreeAndNil(FDefaultOutput);
  525. inherited Destroy;
  526. end;
  527. procedure TCSVBuilder.SetOutput(AStream: TStream);
  528. begin
  529. if Assigned(AStream) then
  530. FOutputStream := AStream
  531. else
  532. FOutputStream := FDefaultOutput;
  533. ResetBuilder;
  534. end;
  535. procedure TCSVBuilder.ResetBuilder;
  536. begin
  537. if FOutputStream = FDefaultOutput then
  538. FDefaultOutput.Clear;
  539. // Do not clear external FOutputStream because it may be pipe stream
  540. // or something else that does not support size and position.
  541. // To clear external output is up to the user of TCSVBuilder.
  542. FNeedLeadingDelimiter := False;
  543. end;
  544. procedure TCSVBuilder.AppendCell(const AValue: String);
  545. var
  546. CellValue: String;
  547. begin
  548. if FNeedLeadingDelimiter then
  549. FOutputStream.WriteBuffer(FDelimiter, CsvCharSize);
  550. CellValue := ChangeLineEndings(AValue, FLineEnding);
  551. CellValue := QuoteCSVString(CellValue);
  552. AppendStringToStream(CellValue, FOutputStream);
  553. FNeedLeadingDelimiter := True;
  554. end;
  555. procedure TCSVBuilder.AppendRow;
  556. begin
  557. AppendStringToStream(FLineEnding, FOutputStream);
  558. FNeedLeadingDelimiter := False;
  559. end;
  560. end.