csvreadwrite.pp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638
  1. {
  2. CSV Parser, Builder classes.
  3. Version 0.5 2014-10-25
  4. Copyright (C) 2010-2014 Vladimir Zhirov <[email protected]>
  5. Contributors:
  6. Luiz Americo Pereira Camara
  7. Mattias Gaertner
  8. Reinier Olislagers
  9. This library is free software; you can redistribute it and/or modify it
  10. under the terms of the GNU Library General Public License as published by
  11. the Free Software Foundation; either version 2 of the License, or (at your
  12. option) any later version with the following modification:
  13. As a special exception, the copyright holders of this library give you
  14. permission to link this library with independent modules to produce an
  15. executable, regardless of the license terms of these independent modules,and
  16. to copy and distribute the resulting executable under terms of your choice,
  17. provided that you also meet, for each linked independent module, the terms
  18. and conditions of the license of that module. An independent module is a
  19. module which is not derived from or based on this library. If you modify
  20. this library, you may extend this exception to your version of the library,
  21. but you are not obligated to do so. If you do not wish to do so, delete this
  22. exception statement from your version.
  23. This program is distributed in the hope that it will be useful, but WITHOUT
  24. ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  25. FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
  26. for more details.
  27. You should have received a copy of the GNU Library General Public License
  28. along with this library; if not, write to the Free Software Foundation,
  29. Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  30. }
  31. unit csvreadwrite;
  32. {$mode objfpc}{$H+}
  33. interface
  34. uses
  35. Classes, SysUtils, strutils;
  36. Type
  37. TCSVChar = Char;
  38. { TCSVHandler }
  39. TCSVHandler = class(TPersistent)
  40. private
  41. procedure SetDelimiter(const AValue: TCSVChar);
  42. procedure SetQuoteChar(const AValue: TCSVChar);
  43. procedure UpdateCachedChars;
  44. protected
  45. // special chars
  46. FDelimiter: TCSVChar;
  47. FQuoteChar: TCSVChar;
  48. FLineEnding: String;
  49. // cached values to speed up special chars operations
  50. FSpecialChars: TSysCharSet;
  51. FDoubleQuote: String;
  52. // parser settings
  53. FIgnoreOuterWhitespace: Boolean;
  54. // builder settings
  55. FQuoteOuterWhitespace: Boolean;
  56. // document settings
  57. FEqualColCountPerRow: Boolean;
  58. public
  59. constructor Create; virtual;
  60. procedure Assign(ASource: TPersistent); override;
  61. procedure AssignCSVProperties(ASource: TCSVHandler);
  62. // Delimiter that separates the field, e.g. comma, semicolon, tab
  63. property Delimiter: TCSVChar read FDelimiter write SetDelimiter;
  64. // Character used to quote "problematic" data
  65. // (e.g. with delimiters or spaces in them)
  66. // A common quotechar is "
  67. property QuoteChar: TCSVChar read FQuoteChar write SetQuoteChar;
  68. // String at the end of the line of data (e.g. CRLF)
  69. property LineEnding: String read FLineEnding write FLineEnding;
  70. // Ignore whitespace between delimiters and field data
  71. property IgnoreOuterWhitespace: Boolean read FIgnoreOuterWhitespace write FIgnoreOuterWhitespace;
  72. // Use quotes when outer whitespace is found
  73. property QuoteOuterWhitespace: Boolean read FQuoteOuterWhitespace write FQuoteOuterWhitespace;
  74. // When reading and writing: make sure every line has the same column count, create empty cells in the end of row if required
  75. property EqualColCountPerRow: Boolean read FEqualColCountPerRow write FEqualColCountPerRow;
  76. end;
  77. // Sequential input from CSV stream
  78. { TCSVParser }
  79. TCSVByteOrderMark = (bomNone, bomUTF8, bomUTF16LE, bomUTF16BE);
  80. TCSVParser = class(TCSVHandler)
  81. private
  82. FFreeStream: Boolean;
  83. // fields
  84. FSourceStream: TStream;
  85. FStrStreamWrapper: TStringStream;
  86. FBOM: TCSVByteOrderMark;
  87. FDetectBOM: Boolean;
  88. // parser state
  89. EndOfFile: Boolean;
  90. EndOfLine: Boolean;
  91. FCurrentChar: TCSVChar;
  92. FCurrentRow: Integer;
  93. FCurrentCol: Integer;
  94. FMaxColCount: Integer;
  95. // output buffers
  96. FCellBuffer: String;
  97. FWhitespaceBuffer: String;
  98. procedure ClearOutput;
  99. // basic parsing
  100. procedure SkipEndOfLine;
  101. procedure SkipDelimiter;
  102. procedure SkipWhitespace;
  103. procedure NextChar;
  104. // complex parsing
  105. procedure ParseCell;
  106. procedure ParseQuotedValue;
  107. // simple parsing
  108. procedure ParseValue;
  109. public
  110. constructor Create; override;
  111. destructor Destroy; override;
  112. // Source data stream
  113. procedure SetSource(AStream: TStream); overload;
  114. // Source data string.
  115. procedure SetSource(const AString: String); overload;
  116. // Rewind to beginning of data
  117. procedure ResetParser;
  118. // Read next cell data; return false if end of file reached
  119. function ParseNextCell: Boolean;
  120. // Current row (0 based)
  121. property CurrentRow: Integer read FCurrentRow;
  122. // Current column (0 based); -1 if invalid/before beginning of file
  123. property CurrentCol: Integer read FCurrentCol;
  124. // Data in current cell
  125. property CurrentCellText: String read FCellBuffer;
  126. // The maximum number of columns found in the stream:
  127. property MaxColCount: Integer read FMaxColCount;
  128. // Does the parser own the stream ? If true, a previous stream is freed when set or when parser is destroyed.
  129. Property FreeStream : Boolean Read FFreeStream Write FFreeStream;
  130. // Return BOM found in file
  131. property BOM: TCSVByteOrderMark read FBOM;
  132. // Detect whether a BOM marker is present. If set to True, then BOM can be used to see what BOM marker there was.
  133. property DetectBOM: Boolean read FDetectBOM write FDetectBOM default false;
  134. end;
  135. // Sequential output to CSV stream
  136. TCSVBuilder = class(TCSVHandler)
  137. private
  138. FOutputStream: TStream;
  139. FDefaultOutput: TMemoryStream;
  140. FNeedLeadingDelimiter: Boolean;
  141. function GetDefaultOutputAsString: String;
  142. protected
  143. procedure AppendStringToStream(const AString: String; AStream: TStream);
  144. function QuoteCSVString(const AValue: String): String;
  145. public
  146. constructor Create; override;
  147. destructor Destroy; override;
  148. // Set output/destination stream.
  149. // If not called, output is sent to DefaultOutput
  150. procedure SetOutput(AStream: TStream);
  151. // If using default stream, reset output to beginning.
  152. // If using user-defined stream, user should reposition stream himself
  153. procedure ResetBuilder;
  154. // Add a cell to the output with data AValue
  155. procedure AppendCell(const AValue: String);
  156. // Write end of row to the output, starting a new row
  157. procedure AppendRow;
  158. // Default output as memorystream (if output not set using SetOutput)
  159. property DefaultOutput: TMemoryStream read FDefaultOutput;
  160. // Default output in string format (if output not set using SetOutput)
  161. property DefaultOutputAsString: String read GetDefaultOutputAsString;
  162. end;
  163. function ChangeLineEndings(const AString, ALineEnding: String): String;
  164. implementation
  165. const
  166. CsvCharSize = SizeOf(TCSVChar);
  167. CR = #13;
  168. LF = #10;
  169. HTAB = #9;
  170. SPACE = #32;
  171. WhitespaceChars = [HTAB, SPACE];
  172. LineEndingChars = [CR, LF];
  173. // The following implementation of ChangeLineEndings function originates from
  174. // Lazarus CodeTools library by Mattias Gaertner. It was explicitly allowed
  175. // by Mattias to relicense it under modified LGPL and include into CsvDocument.
  176. function ChangeLineEndings(const AString, ALineEnding: String): String;
  177. var
  178. I: Integer;
  179. Src: PChar;
  180. Dest: PChar;
  181. DestLength: Integer;
  182. EndingLength: Integer;
  183. EndPos: PChar;
  184. begin
  185. if AString = '' then
  186. Exit(AString);
  187. EndingLength := Length(ALineEnding);
  188. DestLength := Length(AString);
  189. Src := PChar(AString);
  190. EndPos := Src + DestLength;
  191. while Src < EndPos do
  192. begin
  193. if (Src^ = CR) then
  194. begin
  195. Inc(Src);
  196. if (Src^ = LF) then
  197. begin
  198. Inc(Src);
  199. Inc(DestLength, EndingLength - 2);
  200. end else
  201. Inc(DestLength, EndingLength - 1);
  202. end else
  203. begin
  204. if (Src^ = LF) then
  205. Inc(DestLength, EndingLength - 1);
  206. Inc(Src);
  207. end;
  208. end;
  209. SetLength(Result, DestLength);
  210. Src := PChar(AString);
  211. Dest := PChar(Result);
  212. EndPos := Dest + DestLength;
  213. while (Dest < EndPos) do
  214. begin
  215. if Src^ in LineEndingChars then
  216. begin
  217. for I := 1 to EndingLength do
  218. begin
  219. Dest^ := ALineEnding[I];
  220. Inc(Dest);
  221. end;
  222. if (Src^ = CR) and (Src[1] = LF) then
  223. Inc(Src, 2)
  224. else
  225. Inc(Src);
  226. end else
  227. begin
  228. Dest^ := Src^;
  229. Inc(Src);
  230. Inc(Dest);
  231. end;
  232. end;
  233. end;
  234. { TCSVHandler }
  235. procedure TCSVHandler.SetDelimiter(const AValue: TCSVChar);
  236. begin
  237. if FDelimiter <> AValue then
  238. begin
  239. FDelimiter := AValue;
  240. UpdateCachedChars;
  241. end;
  242. end;
  243. procedure TCSVHandler.SetQuoteChar(const AValue: TCSVChar);
  244. begin
  245. if FQuoteChar <> AValue then
  246. begin
  247. FQuoteChar := AValue;
  248. UpdateCachedChars;
  249. end;
  250. end;
  251. procedure TCSVHandler.UpdateCachedChars;
  252. begin
  253. FDoubleQuote := FQuoteChar + FQuoteChar;
  254. FSpecialChars := [CR, LF, FDelimiter, FQuoteChar];
  255. end;
  256. constructor TCSVHandler.Create;
  257. begin
  258. inherited Create;
  259. FDelimiter := ',';
  260. FQuoteChar := '"';
  261. FLineEnding := sLineBreak;
  262. FIgnoreOuterWhitespace := False;
  263. FQuoteOuterWhitespace := True;
  264. FEqualColCountPerRow := True;
  265. UpdateCachedChars;
  266. end;
  267. procedure TCSVHandler.Assign(ASource: TPersistent);
  268. begin
  269. if (ASource is TCSVHandler) then
  270. AssignCSVProperties(ASource as TCSVHandler)
  271. else
  272. inherited Assign(ASource);
  273. end;
  274. procedure TCSVHandler.AssignCSVProperties(ASource: TCSVHandler);
  275. begin
  276. FDelimiter := ASource.FDelimiter;
  277. FQuoteChar := ASource.FQuoteChar;
  278. FLineEnding := ASource.FLineEnding;
  279. FIgnoreOuterWhitespace := ASource.FIgnoreOuterWhitespace;
  280. FQuoteOuterWhitespace := ASource.FQuoteOuterWhitespace;
  281. FEqualColCountPerRow := ASource.FEqualColCountPerRow;
  282. UpdateCachedChars;
  283. end;
  284. { TCSVParser }
  285. procedure TCSVParser.ClearOutput;
  286. begin
  287. FCellBuffer := '';
  288. FWhitespaceBuffer := '';
  289. FCurrentRow := 0;
  290. FCurrentCol := -1;
  291. FMaxColCount := 0;
  292. end;
  293. procedure TCSVParser.SkipEndOfLine;
  294. begin
  295. // treat LF+CR as two linebreaks, not one
  296. if (FCurrentChar = CR) then
  297. NextChar;
  298. if (FCurrentChar = LF) then
  299. NextChar;
  300. end;
  301. procedure TCSVParser.SkipDelimiter;
  302. begin
  303. if FCurrentChar = FDelimiter then
  304. NextChar;
  305. end;
  306. procedure TCSVParser.SkipWhitespace;
  307. begin
  308. while FCurrentChar = SPACE do
  309. NextChar;
  310. end;
  311. procedure TCSVParser.NextChar;
  312. begin
  313. if FSourceStream.Read(FCurrentChar, CsvCharSize) < CsvCharSize then
  314. begin
  315. FCurrentChar := #0;
  316. EndOfFile := True;
  317. end;
  318. EndOfLine := FCurrentChar in LineEndingChars;
  319. end;
  320. procedure TCSVParser.ParseCell;
  321. begin
  322. FCellBuffer := '';
  323. if FIgnoreOuterWhitespace then
  324. SkipWhitespace;
  325. if FCurrentChar = FQuoteChar then
  326. ParseQuotedValue
  327. else
  328. ParseValue;
  329. end;
  330. procedure TCSVParser.ParseQuotedValue;
  331. var
  332. QuotationEnd: Boolean;
  333. begin
  334. NextChar; // skip opening quotation char
  335. repeat
  336. // read value up to next quotation char
  337. while not ((FCurrentChar = FQuoteChar) or EndOfFile) do
  338. begin
  339. if EndOfLine then
  340. begin
  341. AppendStr(FCellBuffer, FLineEnding);
  342. SkipEndOfLine;
  343. end else
  344. begin
  345. AppendStr(FCellBuffer, FCurrentChar);
  346. NextChar;
  347. end;
  348. end;
  349. // skip quotation char (closing or escaping)
  350. if not EndOfFile then
  351. NextChar;
  352. // check if it was escaping
  353. if FCurrentChar = FQuoteChar then
  354. begin
  355. AppendStr(FCellBuffer, FCurrentChar);
  356. QuotationEnd := False;
  357. NextChar;
  358. end else
  359. QuotationEnd := True;
  360. until QuotationEnd;
  361. // read the rest of the value until separator or new line
  362. ParseValue;
  363. end;
  364. procedure TCSVParser.ParseValue;
  365. begin
  366. while not ((FCurrentChar = FDelimiter) or EndOfLine or EndOfFile or (FCurrentChar = FQuoteChar)) do
  367. begin
  368. AppendStr(FCellBuffer, FCurrentChar);
  369. NextChar;
  370. end;
  371. if FCurrentChar = FQuoteChar then
  372. ParseQuotedValue;
  373. // merge whitespace buffer
  374. if FIgnoreOuterWhitespace then
  375. RemoveTrailingChars(FWhitespaceBuffer, WhitespaceChars);
  376. AppendStr(FWhitespaceBuffer,FCellBuffer);
  377. FWhitespaceBuffer := '';
  378. end;
  379. constructor TCSVParser.Create;
  380. begin
  381. inherited Create;
  382. ClearOutput;
  383. FStrStreamWrapper := nil;
  384. EndOfFile := True;
  385. end;
  386. destructor TCSVParser.Destroy;
  387. begin
  388. if FFreeStream and (FSourceStream<>FStrStreamWrapper) then
  389. FreeAndNil(FSourceStream);
  390. FreeAndNil(FStrStreamWrapper);
  391. inherited Destroy;
  392. end;
  393. procedure TCSVParser.SetSource(AStream: TStream);
  394. begin
  395. If FSourceStream=AStream then exit;
  396. if FFreeStream and (FSourceStream<>FStrStreamWrapper) then
  397. FreeAndNil(FSourceStream);
  398. FSourceStream := AStream;
  399. ResetParser;
  400. end;
  401. procedure TCSVParser.SetSource(const AString: String); overload;
  402. begin
  403. FreeAndNil(FStrStreamWrapper);
  404. FStrStreamWrapper := TStringStream.Create(AString);
  405. SetSource(FStrStreamWrapper);
  406. end;
  407. procedure TCSVParser.ResetParser;
  408. var
  409. b: packed array[0..2] of byte;
  410. n: Integer;
  411. begin
  412. B[0]:=0; B[1]:=0; B[2]:=0;
  413. ClearOutput;
  414. FSourceStream.Seek(0, soFromBeginning);
  415. if FDetectBOM then
  416. begin
  417. if FSourceStream.Read(b[0], 3)<3 then
  418. begin
  419. n:=0;
  420. FBOM:=bomNone;
  421. end
  422. else if (b[0] = $EF) and (b[1] = $BB) and (b[2] = $BF) then begin
  423. FBOM := bomUTF8;
  424. n := 3;
  425. end else
  426. if (b[0] = $FE) and (b[1] = $FF) then begin
  427. FBOM := bomUTF16BE;
  428. n := 2;
  429. end else
  430. if (b[0] = $FF) and (b[1] = $FE) then begin
  431. FBOM := bomUTF16LE;
  432. n := 2;
  433. end else begin
  434. FBOM := bomNone;
  435. n := 0;
  436. end;
  437. FSourceStream.Seek(n, soFromBeginning);
  438. end;
  439. EndOfFile := False;
  440. NextChar;
  441. end;
  442. // Parses next cell; returns True if there are more cells in the input stream.
  443. function TCSVParser.ParseNextCell: Boolean;
  444. var
  445. LineColCount: Integer;
  446. begin
  447. if EndOfLine or EndOfFile then
  448. begin
  449. // Having read the previous line, adjust column count if necessary:
  450. LineColCount := FCurrentCol + 1;
  451. if LineColCount > FMaxColCount then
  452. FMaxColCount := LineColCount;
  453. end;
  454. if EndOfFile then
  455. Exit(False);
  456. // Handle line ending
  457. if EndOfLine then
  458. begin
  459. SkipEndOfLine;
  460. if EndOfFile then
  461. Exit(False);
  462. FCurrentCol := 0;
  463. Inc(FCurrentRow);
  464. end else
  465. Inc(FCurrentCol);
  466. // Skipping a delimiter should be immediately followed by parsing a cell
  467. // without checking for line break first, otherwise we miss last empty cell.
  468. // But 0th cell does not start with delimiter unlike other cells, so
  469. // the following check is required not to miss the first empty cell:
  470. if FCurrentCol > 0 then
  471. SkipDelimiter;
  472. ParseCell;
  473. Result := True;
  474. end;
  475. { TCSVBuilder }
  476. function TCSVBuilder.GetDefaultOutputAsString: String;
  477. var
  478. StreamSize: Integer;
  479. begin
  480. Result := '';
  481. StreamSize := FDefaultOutput.Size;
  482. if StreamSize > 0 then
  483. begin
  484. SetLength(Result, StreamSize);
  485. FDefaultOutput.Position:=0;
  486. FDefaultOutput.ReadBuffer(Result[1], StreamSize);
  487. end;
  488. end;
  489. procedure TCSVBuilder.AppendStringToStream(const AString: String; AStream: TStream);
  490. var
  491. StrLen: Integer;
  492. begin
  493. StrLen := Length(AString);
  494. if StrLen > 0 then
  495. AStream.WriteBuffer(AString[1], StrLen);
  496. end;
  497. function TCSVBuilder.QuoteCSVString(const AValue: String): String;
  498. var
  499. I: Integer;
  500. ValueLen: Integer;
  501. NeedQuotation: Boolean;
  502. begin
  503. ValueLen := Length(AValue);
  504. NeedQuotation := (AValue <> '') and FQuoteOuterWhitespace
  505. and ((AValue[1] in WhitespaceChars) or (AValue[ValueLen] in WhitespaceChars));
  506. if not NeedQuotation then
  507. for I := 1 to ValueLen do
  508. begin
  509. if AValue[I] in FSpecialChars then
  510. begin
  511. NeedQuotation := True;
  512. Break;
  513. end;
  514. end;
  515. if NeedQuotation then
  516. begin
  517. // double existing quotes
  518. Result := FDoubleQuote;
  519. Insert(StringReplace(AValue, FQuoteChar, FDoubleQuote, [rfReplaceAll]),
  520. Result, 2);
  521. end else
  522. Result := AValue;
  523. end;
  524. constructor TCSVBuilder.Create;
  525. begin
  526. inherited Create;
  527. FDefaultOutput := TMemoryStream.Create;
  528. FOutputStream := FDefaultOutput;
  529. end;
  530. destructor TCSVBuilder.Destroy;
  531. begin
  532. FreeAndNil(FDefaultOutput);
  533. inherited Destroy;
  534. end;
  535. procedure TCSVBuilder.SetOutput(AStream: TStream);
  536. begin
  537. if Assigned(AStream) then
  538. FOutputStream := AStream
  539. else
  540. FOutputStream := FDefaultOutput;
  541. ResetBuilder;
  542. end;
  543. procedure TCSVBuilder.ResetBuilder;
  544. begin
  545. if FOutputStream = FDefaultOutput then
  546. FDefaultOutput.Clear;
  547. // Do not clear external FOutputStream because it may be pipe stream
  548. // or something else that does not support size and position.
  549. // To clear external output is up to the user of TCSVBuilder.
  550. FNeedLeadingDelimiter := False;
  551. end;
  552. procedure TCSVBuilder.AppendCell(const AValue: String);
  553. var
  554. CellValue: String;
  555. begin
  556. if FNeedLeadingDelimiter then
  557. FOutputStream.WriteBuffer(FDelimiter, CsvCharSize);
  558. CellValue := ChangeLineEndings(AValue, FLineEnding);
  559. CellValue := QuoteCSVString(CellValue);
  560. AppendStringToStream(CellValue, FOutputStream);
  561. FNeedLeadingDelimiter := True;
  562. end;
  563. procedure TCSVBuilder.AppendRow;
  564. begin
  565. AppendStringToStream(FLineEnding, FOutputStream);
  566. FNeedLeadingDelimiter := False;
  567. end;
  568. end.