/*  Part of SWI-Prolog

    Author:        Jan Wielemaker
    E-mail:        J.Wielemaker@vu.nl
    WWW:           http://www.swi-prolog.org
    Copyright (c)  2009-2022, VU University Amsterdam
                              CWI, Amsterdam
                              SWI-Prolog Solutions b.v.
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.

    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in
       the documentation and/or other materials provided with the
       distribution.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
*/

:- module(csv,
          [ csv//1,                     % +Rows
            csv//2,                     % +Rows, +Options
            csv_read_file/2,            % +File, -Data
            csv_read_file/3,            % +File, -Data, +Options
            csv_read_stream/3,          % +Stream, -Data, +Options
            csv_read_file_row/3,        % +File, -Row, +Options
            csv_read_row/3,             % +Stream, -Row, +CompiledOptions
            csv_options/2,              % -Compiled, +Options
            csv_write_file/2,           % +File, +Data
            csv_write_file/3,           % +File, +Data, +Options
            csv_write_stream/3          % +Stream, +Data, +Options
          ]).
:- use_module(library(record),[(record)/1, op(_,_,record)]).
:- autoload(library(apply),[maplist/2]).
:- autoload(library(debug),[debug/3]).
:- autoload(library(error),[must_be/2,domain_error/2]).
:- autoload(library(lists),[append/3]).
:- autoload(library(option),[option/2,select_option/4]).
:- autoload(library(pure_input),
            [phrase_from_file/3,phrase_from_stream/2]).
:- autoload(library(readutil),[read_line_to_codes/2]).
:- autoload(library(dcg/basics),[string//1,eos//0]).

/** <module> Process CSV (Comma-Separated Values) data

This library parses and generates CSV data. CSV data is represented in
Prolog as a list of rows. Each row is a compound term, where all rows
have the same name and arity.

@tbd    Implement immediate assert of the data to avoid possible stack
        overflows.
@tbd    Writing creates an intermediate code-list, possibly overflowing
        resources.  This waits for pure output!
@see    RFC 4180
*/

:- predicate_options(csv//2, 2,
                     [ separator(nonneg),       % must be a character code
                       strip(boolean),
                       ignore_quotes(boolean),
                       convert(boolean),
                       case(oneof([down,preserve,up])),
                       functor(atom),
                       arity(-nonneg),          % actually ?nonneg
                       match_arity(boolean)
                     ]).
:- predicate_options(csv_read_file/3, 3,
                     [ pass_to(csv//2, 2),
                       pass_to(phrase_from_file/3, 3)
                     ]).
:- predicate_options(csv_read_file_row/3, 3,
                     [ line(-integer),
                       pass_to(csv//2, 2),
                       pass_to(open/4, 4)
                     ]).
:- predicate_options(csv_write_file/3, 3,
                     [ pass_to(csv//2, 2),
                       pass_to(open/4, 4)
                     ]).
:- predicate_options(csv_write_stream/3, 3,
                     [ pass_to(csv//2, 2)
                     ]).
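% Illustrative sketch only (not part of the library): the relation between
% CSV text and the row-list representation described above.  The sample
% data is made up for this example; backquotes denote a code list in
% SWI-Prolog.
%
%   ?- phrase(csv(Rows), `name,age\nalice,42\n`).
%   Rows = [row(name, age), row(alice, 42)].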
:- record
    csv_options(separator:integer=0',,
                strip:boolean=false,
                ignore_quotes:boolean=false,
                convert:boolean=true,
                case:oneof([down,preserve,up])=preserve,
                functor:atom=row,
                arity:integer,
                match_arity:boolean=true,
                skip_header:atom).

%!  csv_read_file(+File, -Rows) is det.
%!  csv_read_file(+File, -Rows, +Options) is det.
%
%   Read a CSV file into a list of rows. Each row is a compound term
%   and all rows have the same name and arity. Options is handed to
%   csv//2. Remaining options are processed by phrase_from_file/3. The
%   default separator depends on the file name extension and is =|\t|=
%   for =|.tsv|= files and =|,|= otherwise.
%
%   Suppose we want to create a predicate table/6 from a CSV file
%   that we know contains 6 fields per record. This can be done
%   using the code below. Without the option arity(6), this would
%   generate a predicate table/N, where N is the number of fields
%   per record in the data.
%
%   ==
%   ?- csv_read_file(File, Rows, [functor(table), arity(6)]),
%      maplist(assert, Rows).
%   ==

csv_read_file(File, Rows) :-
    csv_read_file(File, Rows, []).

csv_read_file(File, Rows, Options) :-
    default_separator(File, Options, Options1),
    make_csv_options(Options1, Record, RestOptions),
    phrase_from_file(csv_roptions(Rows, Record), File, RestOptions).

default_separator(File, Options0, Options) :-
    (   option(separator(_), Options0)
    ->  Options = Options0
    ;   file_name_extension(_, Ext0, File),
        downcase_atom(Ext0, Ext),
        ext_separator(Ext, Sep)
    ->  Options = [separator(Sep)|Options0]
    ;   Options = Options0
    ).

ext_separator(csv, 0',).
ext_separator(tsv, 0'\t).

%!  csv_read_stream(+Stream, -Rows, +Options) is det.
%
%   Read CSV data from Stream.  See also csv_read_row/3.

csv_read_stream(Stream, Rows, Options) :-
    make_csv_options(Options, Record, _),
    phrase_from_stream(csv_roptions(Rows, Record), Stream).

%!  csv(?Rows)// is det.
%!  csv(?Rows, +Options)// is det.
%
%   Prolog DCG to `read/write' CSV data.  Options:
%
%     * separator(+Code)
%       The field separator.  Must be a character code.  Default is
%       (of course) the comma.  Character codes can be specified
%       using the 0' notation.  E.g., using =|separator(0';)|= parses
%       a semicolon separated file (an illustrative query follows
%       below).
%
%     * ignore_quotes(+Boolean)
%       If =true= (default =false=), treat double quotes as a normal
%       character.
%
%     * strip(+Boolean)
%       If =true= (default =false=), strip leading and trailing
%       blank space.  RFC 4180 says that blank space is part of the
%       data.
%
%     * skip_header(+CommentLead)
%       Skip leading lines that start with CommentLead.  There is
%       no standard for comments in CSV files, but some CSV files
%       have a header where each line starts with `#`.  After
%       skipping comment lines this option causes csv//2 to skip empty
%       lines.  Note that an empty line may not contain white space
%       characters (space or tab) as these may provide valid data.
%
%     * convert(+Boolean)
%       If =true= (default), use name/2 on the field data.  This
%       translates the field into a number if possible.
%
%     * case(+Action)
%       If =down=, downcase atomic values.  If =up=, upcase them
%       and if =preserve= (default), do not change the case.
%
%     * functor(+Atom)
%       Functor to use for creating row terms.  Default is =row=.
%
%     * arity(?Arity)
%       Number of fields in each row.  This predicate raises a
%       domain_error(row_arity(Expected), Found) if a row is found
%       with a different arity.
%
%     * match_arity(+Boolean)
%       If =false= (default =true=), do not reject CSV files where
%       lines provide a varying number of fields (columns).  This
%       can be used as a work-around for some malformed CSV files.

csv(Rows) -->
    csv(Rows, []).
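% Illustrative sketch only (not part of the library): parsing in-memory,
% semicolon-separated data with csv//2.  The data below is made up for
% this example; backquotes denote a code list.
%
%   ?- phrase(csv(Rows, [separator(0';)]), `a;b;c\n1;2;3\n`).
%   Rows = [row(a, b, c), row(1, 2, 3)].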
csv(Rows, Options) -->
    { make_csv_options(Options, Record, _) },
    csv_roptions(Rows, Record).

csv_roptions(Rows, Record) -->
    { ground(Rows) },
    !,
    emit_csv(Rows, Record).
csv_roptions(Rows, Record) -->
    skip_header(Record),
    csv_data(Rows, Record).

skip_header(Options) -->
    { csv_options_skip_header(Options, CommentStart),
      nonvar(CommentStart),
      atom_codes(CommentStart, Codes)
    },
    !,
    skip_header_lines(Codes),
    skip_blank_lines.
skip_header(_) -->
    [].

skip_header_lines(CommentStart) -->
    string(CommentStart),
    !,
    (   string(_Comment),
        end_of_record
    ->  skip_header_lines(CommentStart)
    ).
skip_header_lines(_) -->
    [].

skip_blank_lines -->
    eos,
    !.
skip_blank_lines -->
    end_of_record,
    !,
    skip_blank_lines.
skip_blank_lines -->
    [].

csv_data([], _) -->
    eos,
    !.
csv_data([Row|More], Options) -->
    row(Row, Options),
    !,
    { debug(csv, 'Row: ~p', [Row]) },
    csv_data(More, Options).

row(Row, Options) -->
    fields(Fields, Options),
    { csv_options_functor(Options, Functor),
      Row =.. [Functor|Fields],
      functor(Row, _, Arity),
      check_arity(Options, Arity)
    }.

check_arity(Options, Arity) :-
    csv_options_arity(Options, Arity),
    !.
check_arity(Options, _) :-
    csv_options_match_arity(Options, false),
    !.
check_arity(Options, Arity) :-
    csv_options_arity(Options, Expected),
    domain_error(row_arity(Expected), Arity).

fields([F|T], Options) -->
    field(F, Options),
    (   separator(Options)
    ->  fields(T, Options)
    ;   end_of_record
    ->  { T = [] }
    ).

field(Value, Options) -->
    "\"",
    { csv_options_ignore_quotes(Options, false) },
    !,
    string_codes(Codes),
    { make_value(Codes, Value, Options) }.
field(Value, Options) -->
    { csv_options_strip(Options, true) },
    !,
    stripped_field(Value, Options).
field(Value, Options) -->
    { csv_options_separator(Options, Sep) },
    field_codes(Codes, Sep),
    { make_value(Codes, Value, Options) }.

stripped_field(Value, Options) -->
    ws,
    (   "\"",                           % quoted field after leading blanks
        { csv_options_ignore_quotes(Options, false) }
    ->  string_codes(Codes),
        ws
    ;   { csv_options_separator(Options, Sep) },
        field_codes(Codes0, Sep),
        { strip_trailing_ws(Codes0, Codes) }
    ),
    { make_value(Codes, Value, Options) }.

ws --> " ", !, ws.
ws --> "\t", !, ws.
ws --> "".

strip_trailing_ws(List, Stripped) :-
    append(Stripped, WS, List),
    all_ws(WS).

all_ws([]).
all_ws([32|T]) :- all_ws(T).
all_ws([9|T]) :- all_ws(T).

%!  string_codes(-Codes)
%
%   Process a double-quoted string where the quote is escaped by
%   doubling it.  Eats the terminating double-quote.

string_codes(List) -->
    [H],
    (   { H == 0'" }
    ->  (   "\""
        ->  { List = [H|T] },
            string_codes(T)
        ;   { List = [] }
        )
    ;   { List = [H|T] },
        string_codes(T)
    ).

field_codes([], Sep), [Sep] -->
    [Sep],
    !.
field_codes([], _), "\n" -->
    "\r\n",
    !.
field_codes([], _), "\n" -->
    "\n",
    !.
field_codes([], _), "\n" -->
    "\r",
    !.
field_codes([H|T], Sep) -->
    [H],
    !,
    field_codes(T, Sep).
field_codes([], _) -->                  % unterminated last record
    [].

%!  make_value(+Codes, -Value, +Options) is det.
%
%   Convert a list of character codes to the actual value, depending
%   on Options.

make_value(Codes, Value, Options) :-
    csv_options_convert(Options, Convert),
    csv_options_case(Options, Case),
    make_value(Convert, Case, Codes, Value).

make_value(true, preserve, Codes, Value) :-
    !,
    name(Value, Codes).
make_value(true, Case, Codes, Value) :-
    !,
    (   number_string(Value, Codes)
    ->  true
    ;   make_value(false, Case, Codes, Value)
    ).
make_value(false, preserve, Codes, Value) :-
    !,
    atom_codes(Value, Codes).
make_value(false, down, Codes, Value) :-
    !,
    string_codes(String, Codes),
    downcase_atom(String, Value).
make_value(false, up, Codes, Value) :-
    string_codes(String, Codes),
    upcase_atom(String, Value).
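% Illustrative sketch only (not part of the library): the effect of the
% convert/1 and case/1 options on field values.  The input below is made
% up for this example.
%
%   ?- phrase(csv(Rows, []), `ABC,42\n`).
%   Rows = [row('ABC', 42)].
%
%   ?- phrase(csv(Rows, [convert(false), case(down)]), `ABC,42\n`).
%   Rows = [row(abc, '42')].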
separator(Options) --> { csv_options_separator(Options, Sep) }, [Sep]. end_of_record --> "\n". % Unix files end_of_record --> "\r\n". % DOS files end_of_record --> "\r". % MacOS files end_of_record --> eos. % unterminated last record %! csv_read_file_row(+File, -Row, +Options) is nondet. % % True when Row is a row in File. First unifies Row with the first % row in File. Backtracking yields the second, ... row. This % interface is an alternative to csv_read_file/3 that avoids % loading all rows in memory. Note that this interface does not % guarantee that all rows in File have the same arity. % % In addition to the options of csv_read_file/3, this predicate % processes the option: % % * line(-Line) % Line is unified with the 1-based line-number from which Row is % read. Note that Line is not the physical line, but rather the % _logical_ record number. csv_read_file_row(File, Row, Options) :- default_separator(File, Options, Options1), make_csv_options(Options1, RecordOptions, Options2), select_option(line(Line), Options2, RestOptions, _), setup_call_cleanup( open(File, read, Stream, RestOptions), csv_read_stream_row(Stream, Row, Line, RecordOptions), close(Stream)). csv_read_stream_row(Stream, Row, Line, Options) :- between(1, infinite, Line), ( csv_read_row(Stream, Row0, Options), Row0 \== end_of_file -> Row = Row0 ; !, fail ). %! csv_read_row(+Stream, -Row, +CompiledOptions) is det. % % Read the next CSV record from Stream and unify the result with Row. % CompiledOptions is created from options defined for csv//2 using % csv_options/2. Row is unified with `end_of_file` upon reaching the % end of the input. csv_read_row(Stream, Row, _Record) :- at_end_of_stream(Stream), !, Row = end_of_file. csv_read_row(Stream, Row, Record) :- read_lines_to_codes(Stream, Codes, Record, even), phrase(row(Row0, Record), Codes), !, Row = Row0. read_lines_to_codes(Stream, Codes, Options, QuoteQuantity) :- read_line_to_codes(Stream, Codes0), Codes0 \== end_of_file, ( ( csv_options_ignore_quotes(Options, true) ; check_quotes(Codes0, QuoteQuantity, even) ) -> Codes = Codes0 ; append(Codes0, [0'\n|Tail], Codes), read_lines_to_codes(Stream, Tail, Options, odd) ). check_quotes([], QuoteQuantity, QuoteQuantity) :- !. check_quotes([0'"|T], odd, Result) :- !, check_quotes(T, even, Result). check_quotes([0'"|T], even, Result) :- !, check_quotes(T, odd, Result). check_quotes([_|T], QuoteQuantity, Result) :- check_quotes(T, QuoteQuantity, Result). %! csv_options(-Compiled, +Options) is det. % % Compiled is the compiled representation of the CSV processing % options as they may be passed into csv//2, etc. This predicate is % used in combination with csv_read_row/3 to avoid repeated processing % of the options. csv_options(Compiled, Options) :- make_csv_options(Options, Compiled, _Ignored). /******************************* * OUTPUT * *******************************/ %! csv_write_file(+File, +Data) is det. %! csv_write_file(+File, +Data, +Options) is det. % % Write a list of Prolog terms to a CSV file. Options are given % to csv//2. Remaining options are given to open/4. The default % separator depends on the file name extension and is =|\t|= for % =|.tsv|= files and =|,|= otherwise. csv_write_file(File, Data) :- csv_write_file(File, Data, []). csv_write_file(File, Data, Options) :- must_be(list, Data), default_separator(File, Options, Options1), make_csv_options(Options1, OptionsRecord, RestOptions), setup_call_cleanup( open(File, write, Out, RestOptions), maplist(csv_write_row(Out, OptionsRecord), Data), close(Out)). 
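% A minimal sketch (not part of the library) of reading a file row by row
% using csv_options/2 together with csv_read_row/3, both defined above.
% The predicate names process_csv/1 and read_rows/2 and the use of
% assertz/1 are assumptions made for this example.
%
%   process_csv(File) :-
%       csv_options(CompiledOptions, [functor(data)]),
%       setup_call_cleanup(
%           open(File, read, In),
%           read_rows(In, CompiledOptions),
%           close(In)).
%
%   read_rows(In, Options) :-
%       csv_read_row(In, Row, Options),
%       (   Row == end_of_file
%       ->  true
%       ;   assertz(Row),
%           read_rows(In, Options)
%       ).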
csv_write_row(Out, OptionsRecord, Row) :-
    phrase(emit_row(Row, OptionsRecord), String),
    format(Out, '~s', [String]).

emit_csv([], _) --> [].
emit_csv([H|T], Options) -->
    emit_row(H, Options),
    emit_csv(T, Options).

emit_row(Row, Options) -->
    { Row =.. [_|Fields] },
    emit_fields(Fields, Options),
    "\r\n".                                     % RFC 4180 demands \r\n

emit_fields([], _) -->
    "".
emit_fields([H|T], Options) -->
    emit_field(H, Options),
    (   { T == [] }
    ->  []
    ;   { csv_options_separator(Options, Sep) },
        [Sep],
        emit_fields(T, Options)
    ).

emit_field(H, Options) -->
    { (   atom(H)
      ->  atom_codes(H, Codes)
      ;   string(H)
      ->  string_codes(H, Codes)
      )
    },
    !,
    (   { needs_quotes(H, Options) }
    ->  "\"", emit_string(Codes), "\""
    ;   emit_codes(Codes)
    ).
emit_field([], _) -->
    !,
    { atom_codes('[]', Codes) },
    emit_codes(Codes).
emit_field(H, _) -->
    { number_codes(H,Codes) },
    emit_codes(Codes).

needs_quotes(Atom, _) :-
    sub_atom(Atom, _, _, _, '"'),
    !.
needs_quotes(Atom, _) :-
    sub_atom(Atom, _, _, _, '\n'),
    !.
needs_quotes(Atom, _) :-
    sub_atom(Atom, _, _, _, '\r'),
    !.
needs_quotes(Atom, Options) :-
    csv_options_separator(Options, Sep),
    char_code(Char, Sep),
    sub_atom(Atom, _, _, _, Char),
    !.

emit_string([]) --> "".
emit_string([0'"|T]) --> !, "\"\"", emit_string(T).
emit_string([H|T]) --> [H], emit_string(T).

emit_codes([]) --> "".
emit_codes([0'"|T]) --> !, "\"\"", emit_codes(T).
emit_codes([H|T]) --> [H], emit_codes(T).

%!  csv_write_stream(+Stream, +Data, +Options) is det.
%
%   Write the rows in Data to Stream.  This is similar to
%   csv_write_file/3, but can deal with data that is produced
%   incrementally.  The example below saves all answers from the
%   predicate data/3 to File.
%
%   ==
%   save_data(File) :-
%       setup_call_cleanup(
%           open(File, write, Out),
%           forall(data(C1,C2,C3),
%                  csv_write_stream(Out, [row(C1,C2,C3)], [])),
%           close(Out)).
%   ==

csv_write_stream(Stream, Data, Options) :-
    must_be(list, Data),
    make_csv_options(Options, OptionsRecord, _),
    maplist(csv_write_row(Stream, OptionsRecord), Data).
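% Illustrative sketch only (not part of the library): fields that contain
% the separator, a double quote or a newline are quoted on output and
% embedded quotes are doubled, following RFC 4180.  The sample row is made
% up for this example.
%
%   ?- with_output_to(string(S),
%          csv_write_stream(current_output, [row('a,b', 'c"d')], [])).
%   S = "\"a,b\",\"c\"\"d\"\r\n".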