/* Part of SWI-Prolog Author: Jan Wielemaker E-mail: J.Wielemaker@vu.nl WWW: http://www.swi-prolog.org Copyright (c) 2000-2020, University of Amsterdam VU University Amsterdam CWI, Amsterdam All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ :- module(sgml, [ load_html/3, % +Input, -DOM, +Options load_xml/3, % +Input, -DOM, +Options load_sgml/3, % +Input, -DOM, +Options load_sgml_file/2, % +File, -ListOfContent load_xml_file/2, % +File, -ListOfContent load_html_file/2, % +File, -Document load_structure/3, % +File, -Term, +Options load_dtd/2, % +DTD, +File load_dtd/3, % +DTD, +File, +Options dtd/2, % +Type, -DTD dtd_property/2, % +DTD, ?Property new_dtd/2, % +Doctype, -DTD free_dtd/1, % +DTD open_dtd/3, % +DTD, +Options, -Stream new_sgml_parser/2, % -Parser, +Options free_sgml_parser/1, % +Parser set_sgml_parser/2, % +Parser, +Options get_sgml_parser/2, % +Parser, +Options sgml_parse/2, % +Parser, +Options sgml_register_catalog_file/2, % +File, +StartOrEnd xml_quote_attribute/3, % +In, -Quoted, +Encoding xml_quote_cdata/3, % +In, -Quoted, +Encoding xml_quote_attribute/2, % +In, -Quoted xml_quote_cdata/2, % +In, -Quoted xml_name/1, % +In xml_name/2, % +In, +Encoding xsd_number_string/2, % ?Number, ?String xsd_time_string/3, % ?Term, ?Type, ?String xml_basechar/1, % +Code xml_ideographic/1, % +Code xml_combining_char/1, % +Code xml_digit/1, % +Code xml_extender/1, % +Code iri_xml_namespace/2, % +IRI, -Namespace iri_xml_namespace/3, % +IRI, -Namespace, -LocalName xml_is_dom/1 % +Term ]). :- autoload(library(error),[instantiation_error/1]). :- autoload(library(iostream),[open_any/5,close_any/1]). :- autoload(library(lists),[member/2,selectchk/3]). :- autoload(library(option),[select_option/3,merge_options/3]). :- meta_predicate load_structure(+, -, :), load_html(+, -, :), load_xml(+, -, :), load_sgml(+, -, :). :- predicate_options(load_structure/3, 3, [ charpos(integer), cdata(oneof([atom,string])), defaults(boolean), dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])), doctype(atom), dtd(any), encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])), entity(atom,atom), keep_prefix(boolean), file(atom), line(integer), offset(integer), number(oneof([token,integer])), qualify_attributes(boolean), shorttag(boolean), case_sensitive_attributes(boolean), case_preserving_attributes(boolean), system_entities(boolean), max_memory(integer), ignore_doctype(boolean), space(oneof([sgml,preserve,default,remove,strict])), xmlns(atom), xmlns(atom,atom), pass_to(sgml_parse/2, 2) ]). :- predicate_options(load_html/3, 3, [ pass_to(load_structure/3, 3) ]). :- predicate_options(load_xml/3, 3, [ pass_to(load_structure/3, 3) ]). :- predicate_options(load_sgml/3, 3, [ pass_to(load_structure/3, 3) ]). :- predicate_options(load_dtd/3, 3, [ dialect(oneof([sgml,xml,xmlns])), pass_to(open/4, 4) ]). :- predicate_options(sgml_parse/2, 2, [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]), callable), cdata(oneof([atom,string])), content_length(integer), document(-any), max_errors(integer), parse(oneof([file,element,content,declaration,input])), source(any), syntax_errors(oneof([quiet,print,style])), xml_no_ns(oneof([error,quiet])) ]). :- predicate_options(new_sgml_parser/2, 2, [ dtd(any) ]). /** <module> SGML, XML and HTML parser This library allows you to parse SGML, XML and HTML data into a Prolog data structure. The library defines several families of predicates: $ High-level predicates : Most users will only use load_html/3, load_xml/3 or load_sgml/3 to parse arbitrary input into a _DOM_ structure. These predicates all call load_structure/3, which provides more options and may be used for processing non-standard documents. The DOM structure can be used by library(xpath) to extract information from the document. $ The low-level parser : The actual parser is written in C and consists of two parts: one for processing DTD (Document Type Definitions) and one for parsing data. The data can either be parsed to a Prolog (_DOM_) term or the parser can perform callbacks for the DOM _events_. $ Utility predicates : Finally, this library provides prmitives for classifying characters and strings according to the XML specification such as xml_name/1 to verify whether an atom is a valid XML name (identifier). It also provides primitives to quote attributes and CDATA elements. */ :- multifile user:file_search_path/2. :- dynamic user:file_search_path/2. user:file_search_path(dtd, '.'). user:file_search_path(dtd, swi('library/DTD')). sgml_register_catalog_file(File, Location) :- prolog_to_os_filename(File, OsFile), '_sgml_register_catalog_file'(OsFile, Location). :- use_foreign_library(foreign(sgml2pl)). register_catalog(Base) :- absolute_file_name(dtd(Base), [ extensions([soc]), access(read), file_errors(fail) ], SocFile), sgml_register_catalog_file(SocFile, end). :- initialization ignore(register_catalog('HTML4')). /******************************* * DTD HANDLING * *******************************/ /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Note that concurrent access to DTD objects is not allowed, and hence we will allocate and destroy them in each thread. Possibibly it would be nicer to find out why concurrent access to DTD's is flawed. It is diagnosed to mess with the entity resolution by Fabien Todescato. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ :- thread_local current_dtd/2. :- volatile current_dtd/2. :- thread_local registered_cleanup/0. :- volatile registered_cleanup/0. :- multifile dtd_alias/2. :- create_prolog_flag(html_dialect, html5, [type(atom)]). dtd_alias(html4, 'HTML4'). dtd_alias(html5, 'HTML5'). dtd_alias(html, DTD) :- current_prolog_flag(html_dialect, Dialect), dtd_alias(Dialect, DTD). %! dtd(+Type, -DTD) is det. % % DTD is a DTD object created from the file dtd(Type). Loaded DTD % objects are cached. Note that DTD objects may not be shared % between threads. Therefore, dtd/2 maintains the pool of DTD % objects using a thread_local predicate. DTD objects are % destroyed if a thread terminates. % % @error existence_error(source_sink, dtd(Type)) dtd(Type, DTD) :- current_dtd(Type, DTD), !. dtd(Type, DTD) :- new_dtd(Type, DTD), ( dtd_alias(Type, Base) -> true ; Base = Type ), absolute_file_name(dtd(Base), [ extensions([dtd]), access(read) ], DtdFile), load_dtd(DTD, DtdFile), register_cleanup, asserta(current_dtd(Type, DTD)). %! load_dtd(+DTD, +DtdFile, +Options) % % Load DtdFile into a DTD. Defined options are: % % * dialect(+Dialect) % Dialect to use (xml, xmlns, sgml) % % * encoding(+Encoding) % Encoding of DTD file % % @param DTD is a fresh DTD object, normally created using % new_dtd/1. load_dtd(DTD, DtdFile) :- load_dtd(DTD, DtdFile, []). load_dtd(DTD, DtdFile, Options) :- sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions), setup_call_cleanup( open_dtd(DTD, DTDOptions, DtdOut), setup_call_cleanup( open(DtdFile, read, DtdIn, OpenOptions), copy_stream_data(DtdIn, DtdOut), close(DtdIn)), close(DtdOut)). %! destroy_dtds % % Destroy DTDs cached by this thread as they will become % unreachable anyway. destroy_dtds :- ( current_dtd(_Type, DTD), free_dtd(DTD), fail ; true ). %! register_cleanup % % Register cleanup of DTDs created for this thread. register_cleanup :- registered_cleanup, !. register_cleanup :- ( thread_self(main) -> at_halt(destroy_dtds) ; current_prolog_flag(threads, true) -> prolog_listen(this_thread_exit, destroy_dtds) ; true ), assert(registered_cleanup). /******************************* * EXAMINE DTD * *******************************/ prop(doctype(_), _). prop(elements(_), _). prop(entities(_), _). prop(notations(_), _). prop(entity(E, _), DTD) :- ( nonvar(E) -> true ; '$dtd_property'(DTD, entities(EL)), member(E, EL) ). prop(element(E, _, _), DTD) :- ( nonvar(E) -> true ; '$dtd_property'(DTD, elements(EL)), member(E, EL) ). prop(attributes(E, _), DTD) :- ( nonvar(E) -> true ; '$dtd_property'(DTD, elements(EL)), member(E, EL) ). prop(attribute(E, A, _, _), DTD) :- ( nonvar(E) -> true ; '$dtd_property'(DTD, elements(EL)), member(E, EL) ), ( nonvar(A) -> true ; '$dtd_property'(DTD, attributes(E, AL)), member(A, AL) ). prop(notation(N, _), DTD) :- ( nonvar(N) -> true ; '$dtd_property'(DTD, notations(NL)), member(N, NL) ). dtd_property(DTD, Prop) :- prop(Prop, DTD), '$dtd_property'(DTD, Prop). /******************************* * SGML * *******************************/ %! load_structure(+Source, -ListOfContent, :Options) is det. % % Parse Source and return the resulting structure in % ListOfContent. Source is handed to open_any/5, which allows for % processing an extensible set of input sources. % % A proper XML document contains only a single toplevel element % whose name matches the document type. Nevertheless, a list is % returned for consistency with the representation of element % content. % % The encoding(+Encoding) option is treated special for % compatibility reasons: % % - If `Encoding` is one of =iso-8859-1=, =us-ascii= or =utf-8=, % the stream is opened in binary mode and the option is passed % to the SGML parser. % - If `Encoding` is present, but not one of the above, the % stream is opened in text mode using the given encoding. % - Otherwise (no `Encoding`), the stream is opened in binary % mode and doing the correct decoding is left to the parser. load_structure(Spec, DOM, Options) :- sgml_open_options(Options, OpenOptions, SGMLOptions), setup_call_cleanup( open_any(Spec, read, In, Close, OpenOptions), load_structure_from_stream(In, DOM, SGMLOptions), close_any(Close)). sgml_open_options(Options, OpenOptions, SGMLOptions) :- Options = M:Plain, ( select_option(encoding(Encoding), Plain, NoEnc) -> ( sgml_encoding(Encoding) -> merge_options(NoEnc, [type(binary)], OpenOptions), SGMLOptions = Options ; OpenOptions = Plain, SGMLOptions = M:NoEnc ) ; merge_options(Plain, [type(binary)], OpenOptions), SGMLOptions = Options ). sgml_encoding(Enc) :- downcase_atom(Enc, Enc1), sgml_encoding_l(Enc1). sgml_encoding_l('iso-8859-1'). sgml_encoding_l('us-ascii'). sgml_encoding_l('utf-8'). sgml_encoding_l('utf8'). sgml_encoding_l('iso_latin_1'). sgml_encoding_l('ascii'). load_structure_from_stream(In, Term, M:Options) :- ( select_option(dtd(DTD), Options, Options1) -> ExplicitDTD = true ; ExplicitDTD = false, Options1 = Options ), move_front(Options1, dialect(_), Options2), % dialect sets defaults setup_call_cleanup( new_sgml_parser(Parser, [ dtd(DTD) ]), parse(Parser, M:Options2, TermRead, In), free_sgml_parser(Parser)), ( ExplicitDTD == true -> ( DTD = dtd(_, DocType), dtd_property(DTD, doctype(DocType)) -> true ; true ) ; free_dtd(DTD) ), Term = TermRead. move_front(Options0, Opt, Options) :- selectchk(Opt, Options0, Options1), !, Options = [Opt|Options1]. move_front(Options, _, Options). parse(Parser, M:Options, Document, In) :- set_parser_options(Options, Parser, In, Options1), parser_meta_options(Options1, M, Options2), set_input_location(Parser, In), sgml_parse(Parser, [ document(Document), source(In) | Options2 ]). set_parser_options([], _, _, []). set_parser_options([H|T], Parser, In, Rest) :- ( set_parser_option(H, Parser, In) -> set_parser_options(T, Parser, In, Rest) ; Rest = [H|R2], set_parser_options(T, Parser, In, R2) ). set_parser_option(Var, _Parser, _In) :- var(Var), !, instantiation_error(Var). set_parser_option(Option, Parser, _) :- def_entity(Option, Parser), !. set_parser_option(offset(Offset), _Parser, In) :- !, seek(In, Offset, bof, _). set_parser_option(Option, Parser, _In) :- parser_option(Option), !, set_sgml_parser(Parser, Option). set_parser_option(Name=Value, Parser, In) :- Option =.. [Name,Value], set_parser_option(Option, Parser, In). parser_option(dialect(_)). parser_option(shorttag(_)). parser_option(case_sensitive_attributes(_)). parser_option(case_preserving_attributes(_)). parser_option(system_entities(_)). parser_option(max_memory(_)). parser_option(ignore_doctype(_)). parser_option(file(_)). parser_option(line(_)). parser_option(space(_)). parser_option(number(_)). parser_option(defaults(_)). parser_option(doctype(_)). parser_option(qualify_attributes(_)). parser_option(encoding(_)). parser_option(keep_prefix(_)). def_entity(entity(Name, Value), Parser) :- get_sgml_parser(Parser, dtd(DTD)), xml_quote_attribute(Value, QValue), setup_call_cleanup(open_dtd(DTD, [], Stream), format(Stream, '<!ENTITY ~w "~w">~n', [Name, QValue]), close(Stream)). def_entity(xmlns(URI), Parser) :- set_sgml_parser(Parser, xmlns(URI)). def_entity(xmlns(NS, URI), Parser) :- set_sgml_parser(Parser, xmlns(NS, URI)). %! parser_meta_options(+Options0, +Module, -Options) % % Qualify meta-calling options to the parser. parser_meta_options([], _, []). parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :- !, parser_meta_options(T0, M, T). parser_meta_options([H|T0], M, [H|T]) :- parser_meta_options(T0, M, T). %! set_input_location(+Parser, +In:stream) is det. % % Set the input location if this was not set explicitly set_input_location(Parser, _In) :- get_sgml_parser(Parser, file(_)), !. set_input_location(Parser, In) :- stream_property(In, file_name(File)), !, set_sgml_parser(Parser, file(File)), stream_property(In, position(Pos)), set_sgml_parser(Parser, position(Pos)). set_input_location(_, _). /******************************* * UTILITIES * *******************************/ %! load_sgml_file(+File, -DOM) is det. % % Load SGML from File and unify the resulting DOM structure with % DOM. % % @deprecated New code should use load_sgml/3. load_sgml_file(File, Term) :- load_sgml(File, Term, []). %! load_xml_file(+File, -DOM) is det. % % Load XML from File and unify the resulting DOM structure with % DOM. % % @deprecated New code should use load_xml/3. load_xml_file(File, Term) :- load_xml(File, Term, []). %! load_html_file(+File, -DOM) is det. % % Load HTML from File and unify the resulting DOM structure with % DOM. % % @deprecated New code should use load_html/3. load_html_file(File, DOM) :- load_html(File, DOM, []). %! load_html(+Input, -DOM, +Options) is det. % % Load HTML text from Input and unify the resulting DOM structure % with DOM. Options are passed to load_structure/3, after adding % the following default options: % % - dtd(DTD) % Pass the DTD for HTML as obtained using dtd(html, DTD). % - dialect(Dialect) % Current dialect from the Prolog flag =html_dialect= % - max_errors(-1) % - syntax_errors(quiet) % Most HTML encountered in the wild contains errors. Even in the % context of errors, the resulting DOM term is often a % reasonable guess at the intent of the author. % % You may also want to use the library(http/http_open) to support % loading from HTTP and HTTPS URLs. For example: % % == % :- use_module(library(http/http_open)). % :- use_module(library(sgml)). % % load_html_url(URL, DOM) :- % load_html(URL, DOM, []). % == load_html(File, Term, M:Options) :- current_prolog_flag(html_dialect, Dialect), dtd(Dialect, DTD), merge_options(Options, [ dtd(DTD), dialect(Dialect), max_errors(-1), syntax_errors(quiet) ], Options1), load_structure(File, Term, M:Options1). %! load_xml(+Input, -DOM, +Options) is det. % % Load XML text from Input and unify the resulting DOM structure % with DOM. Options are passed to load_structure/3, after adding % the following default options: % % - dialect(xml) load_xml(Input, DOM, M:Options) :- merge_options(Options, [ dialect(xml) ], Options1), load_structure(Input, DOM, M:Options1). %! load_sgml(+Input, -DOM, +Options) is det. % % Load SGML text from Input and unify the resulting DOM structure % with DOM. Options are passed to load_structure/3, after adding % the following default options: % % - dialect(sgml) load_sgml(Input, DOM, M:Options) :- merge_options(Options, [ dialect(sgml) ], Options1), load_structure(Input, DOM, M:Options1). /******************************* * ENCODING * *******************************/ %! xml_quote_attribute(+In, -Quoted) is det. %! xml_quote_cdata(+In, -Quoted) is det. % % Backward compatibility for versions that allow to specify % encoding. All characters that cannot fit the encoding are mapped % to XML character entities (&#dd;). Using ASCII is the safest % value. xml_quote_attribute(In, Quoted) :- xml_quote_attribute(In, Quoted, ascii). xml_quote_cdata(In, Quoted) :- xml_quote_cdata(In, Quoted, ascii). %! xml_name(+Atom) is semidet. % % True if Atom is a valid XML name. xml_name(In) :- xml_name(In, ascii). /******************************* * XML CHARACTER CLASSES * *******************************/ %! xml_basechar(+CodeOrChar) is semidet. %! xml_ideographic(+CodeOrChar) is semidet. %! xml_combining_char(+CodeOrChar) is semidet. %! xml_digit(+CodeOrChar) is semidet. %! xml_extender(+CodeOrChar) is semidet. % % XML character classification predicates. Each of these % predicates accept both a character (one-character atom) and a % code (integer). % % @see http://www.w3.org/TR/2006/REC-xml-20060816 /******************************* * TYPE CHECKING * *******************************/ %! xml_is_dom(@Term) is semidet. % % True if term statisfies the structure as returned by % load_structure/3 and friends. xml_is_dom(0) :- !, fail. % catch variables xml_is_dom(List) :- is_list(List), !, xml_is_content_list(List). xml_is_dom(Term) :- xml_is_element(Term). xml_is_content_list([]). xml_is_content_list([H|T]) :- xml_is_content(H), xml_is_content_list(T). xml_is_content(0) :- !, fail. xml_is_content(pi(Pi)) :- !, atom(Pi). xml_is_content(CDATA) :- atom(CDATA), !. xml_is_content(CDATA) :- string(CDATA), !. xml_is_content(Term) :- xml_is_element(Term). xml_is_element(element(Name, Attributes, Content)) :- dom_name(Name), dom_attributes(Attributes), xml_is_content_list(Content). dom_name(NS:Local) :- atom(NS), atom(Local), !. dom_name(Local) :- atom(Local). dom_attributes(0) :- !, fail. dom_attributes([]). dom_attributes([H|T]) :- dom_attribute(H), dom_attributes(T). dom_attribute(Name=Value) :- dom_name(Name), atomic(Value). /******************************* * MESSAGES * *******************************/ :- multifile prolog:message/3. % Catch messages. sgml/4 is generated by the SGML2PL binding. prolog:message(sgml(Parser, File, Line, Message)) --> { get_sgml_parser(Parser, dialect(Dialect)) }, [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ]. /******************************* * XREF SUPPORT * *******************************/ :- multifile prolog:called_by/2. prolog:called_by(sgml_parse(_, Options), Called) :- findall(Meta, meta_call_term(_, Meta, Options), Called). meta_call_term(T, G+N, Options) :- T = call(Event, G), pmember(T, Options), call_params(Event, Term), functor(Term, _, N). pmember(X, List) :- % member for partial lists nonvar(List), List = [H|T], ( X = H ; pmember(X, T) ). call_params(begin, begin(tag,attributes,parser)). call_params(end, end(tag,parser)). call_params(cdata, cdata(cdata,parser)). call_params(pi, pi(cdata,parser)). call_params(decl, decl(cdata,parser)). call_params(error, error(severity,message,parser)). call_params(xmlns, xmlns(namespace,url,parser)). call_params(urlns, urlns(url,url,parser)). /******************************* * SANDBOX * *******************************/ :- multifile sandbox:safe_primitive/1, sandbox:safe_meta_predicate/1. sandbox:safe_meta_predicate(sgml:load_structure/3). sandbox:safe_primitive(sgml:dtd(Dialect, _)) :- dtd_alias(Dialect, _). sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)). sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)). sandbox:safe_primitive(sgml:xml_name(_,_)). sandbox:safe_primitive(sgml:xml_basechar(_)). sandbox:safe_primitive(sgml:xml_ideographic(_)). sandbox:safe_primitive(sgml:xml_combining_char(_)). sandbox:safe_primitive(sgml:xml_digit(_)). sandbox:safe_primitive(sgml:xml_extender(_)). sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)). sandbox:safe_primitive(sgml:xsd_number_string(_,_)). sandbox:safe_primitive(sgml:xsd_time_string(_,_,_)).