\documentclass[11pt]{article}
\usepackage{pl}
\usepackage{html}

\onefile
\htmloutput{.}					% Output directory
\htmlmainfile{table}				% Main document file
\bodycolor{white}				% Page colour

\begin{document}

\title{Managing external tables for SWI-Prolog}
\author{Jan Wielemaker \\
	Human Computer Studies (HCS), \\
	University of Amsterdam \\
	The Netherlands \\
	E-mail: \email{J.Wielemaker@uva.nl}}

\maketitle

\begin{abstract}
This document describes a foreign language extension to
\href{http://www.swi-prolog.org}{SWI-Prolog} for the
manipulation of `external tables'. External tables are files using a
textual representation of records separated into fields. The package
allows for a flexible definition of the format of the file in terms of
records and fields, how the information in the file should be mapped
onto Prolog data types and what properties the file has to improve the
performance of lookup.

The table package has been used successfully to deal with large static
databases such as dictionaries. Compared to loading the tables into the
Prolog database, this approach required much less memory and loads much
faster while providing reasonable lookup-performance on sorted tables.

This package uses read-only `mapping' of the database file into
memory and is ported to Win32 (Windows 95 and NT) as well as Unix
systems providing the mmap() system call (Solaris, SunOs, Linux and
many more modern Unices).
\end{abstract}

\vfill

\tableofcontents

\newpage

\section{Introduction}
\label{sec:table-intro}

Prolog programs sometimes need access to large sets of background data.
For example in the {\sc grasp} project we need access to ontologies of
art objects, a large lexicon and translation dictionaries.  Storage
of such information as Prolog clauses is not sufficiently efficient
in terms of the memory requirements.

The table package outlined in this document allows for easy access of
large structured files.  The package uses binary search if possible and
linear search for queries that cannot use more efficient algorithms
without building additional index tables.  Caching is achieved using the
file-to-memory maps supported by many modern operating systems.

The following sections define the interface predicates for the package.
\Secref{example} provides an example to access the Unix password file.

\section{Managing external tables}
\label{sec:table-external}

\subsection{Creating and destroying tables}
\label{sec:table-create-destroy}

This section describes the predicates required for creating and
destroying the access to external database tables.


\begin{description}
\predicate{new_table}{4}{+File, +Columns, +Options, -Handle}
    Create a description of a new table, stored in \arg{File}. \arg{Columns} is	a list of descriptions for each column.  A column
    description is of the form

    \begin{quote}
    \arg{ColumnName}{\tt (}\arg{Type [, ColumnOptions]}{\tt)}
    \end{quote}

    \arg{Type} denotes the Prolog type to which the field should be
    converted and is one of:

    \begin{center}
    \begin{tabular}{|l|p{3in}|}
    \hline
    \tt integer		& Convert to a Prolog integer.  The input is
			  treated as a decimal number. \\
    \tt hexadecimal	& Convert to a Prolog integer.  The input is
			  treated as a hex number. \\
    \tt float		& Convert to a Prolog floating point number.
			  The input is handled by the C-library function
			  {\tt strtod()}. \\
    \tt atom		& Convert to a Prolog atom. \\
    \tt string		& Convert to a SWI-Prolog string object. \\
    \tt code_list	& Convert to a list of {\sc ascii} codes. \\
    \hline
    \end{tabular}
    \end{center}

    \arg{ColumnOptions} is a list of additional properties of the
    column.  Supported values are:

    \begin{center}
    \begin{tabular}{|l|p{3in}|}
    \hline
    \tt sorted		& The field is strictly sorted, but may have
			  (adjacent) duplicate entries.  If the field
			  is textual, it should be sorted
			  alphabetically, otherwise it should be sorted
			  numerically. \\
    \tt sorted(+\arg{Table}) & The (textual) field is sorted using the
			  ordering declared by the named {\em ordering
			  table}.  This option may be used to define
			  reverse order, `dictionary' order or other
			  irregular alphabetical ordering. See
			  new_order_table/2. \\
    \tt unique		& This column has distinct values for each
			  row in the table.  \\
    \tt downcase	& Map all uppercase in the field to lowercase
			  before converting to a Prolog atom, string or
			  code_list. \\
    \tt map_space_to_underscore & Map spaces to underscores before
			  converting to a Prolog atom, string or
			  code_list. \\
    \tt syntax		& For numerical fields.  If the field does not
			  contain a valid number, matching the value
			  fails.  Reading the value returns the value
			  as an atom. \\
    \tt width(+\arg{Chars}) & Field has fixed width of the specified
			  number of characters.  The column-separator
			  is not considered for this column. \\
    \tt arg(+\arg{Index}) & For read_table_record/4, unify the field
			  with the given argument of the record term.
			  Further fields will be assigned index+1,
			  \ldots. \\
    \tt skip		& Don't convert this field to Prolog.  The
			  field is simply skipped without checking
			  for consistency. \\
    \hline
    \end{tabular}
    \end{center}

    The \arg{Options} argument is a list of global options for the
    table.  Defined options are:

    \begin{center}
    \begin{tabular}{|l|p{3in}|}
    \hline
    \tt record_separator(+\arg{Code}) & Character ({\sc ascii}) value
			  of the character separating two records.
			  Default is the newline ({\sc ascii} 10). \\
    \tt field_separator(+\arg{Code}) &  Character ({\sc ascii}) value
			  of the character separating two fields in
			  a record. Default is the space ({\sc ascii}
			  32), which also has a special meaning.  Two
			  fields separated by a space may be separated
			  by any non-empty sequence of spaces and tab
			  ({\sc ascii} 9) characters.  For all other
			  separators, a single character separates the
			  fields. \\
    \tt encoding(+\arg{Encoding}) & Text encoding of the file.  Values
			  are \const{iso_latin_1} (default),
			  \const{utf8} or \const{native}.  The latter
			  uses the native multibyte to unicode
			  conversion. \\
    \tt escape(+\arg{Code}, +\arg{ListOfMap}) & Sometimes, a table
			  defines escape sequences to make it possible
			  to use the separator-characters in
			  text-fields.  This options provides a simple
			  way to handle some standard cases.  \arg{Code}
			  is the {\sc ascii} code of the character that
			  leads the escape sequence.  The default is
			  {\tt -1}, and thus never matched.
			  \arg{ListOfMap} is a list of
			  \arg{From}{\tt{} = }\arg{To} character
			  mappings. The default map table is the
			  identity map, unless \arg{Code} refers to the
			  \verb$\$ character, in which case
			  \verb$\b$, \verb$\e$, \verb$\n$, \verb$\r$
			  and \verb$\t$ have their usual meaning. \\
    \tt functor(\arg{+Head}) & Functor used by read_table_record/4.
			  Default is {\tt record} using the maximal
			  argument index of the fields as arity. \\
    \hline
    \end{tabular}
    \end{center}

    If the options are parsed successfully, \arg{Handle} is unified
    with a term that may be used as a handle to the table for future
    operations on it.  Note that new_table/4 does not access the file
    system, so its success only indicates the description could be
    parsed, not the presence, access or format of the file.

\predicate{open_table}{1}{+Handle}
    Open the table.  This predicate normally does not need to be called
    explicitely, as all operations on the table handle will
    automatically open the table if this is required.  It fails if the
    file cannot be accessed or some other error with the required
    operating-system resources occurs.  The contents of the file is
    not examined by this predicate.

\predicate{close_table}{1}{+Handle}
    Close the file and other system resources, but do not remove the
    description of the table, so it can be re-opened later.

\predicate{free_table}{1}{+Handle}
    Close and remove the handle.  After this operation, \arg{Handle}
    becomes invalid and further references to it causes undefined
    behaviour.

\end{description}

\subsection{Accessing a table}
\label{sec:table-access}

This section describes the predicates to read data from a table.

\subsubsection{Finding record locations in a table}
\label{sec:table-find-record-location}

Records are addressed by their offset in the table (file). As records
have generally non-fixed length, searching is often required.  The
predicates below allow for finding records in the file.

\begin{description}
\predicate{get_table_attribute}{3}{+Handle, +Attribute, -Value}
    Fetch attributes of the table.  Defined attributes:

    \begin{tabular}{lp{3in}}
    \tt file	& Unify value with the name of the file with which
		  the table is associated. \\
    \tt field(\arg{N}) &
		  Unify value with declaration of n-th (1-based) field. \\
    \tt field_separator &
		  Unify value with the field separator character. \\
    \tt record_separator &
		  Unify value with the record separator character. \\
    \tt key_field &
		  Unify value with the 1-based index of the field that is
		  sorted or fails if the table contains no sorted fields. \\
    \tt field_count &
		  Unify value with the total number of columns in the table. \\
    \tt size	& Unify value with the number of characters
                  in the table-file, {\bf not} the number of records. \\
    \tt window  & Unify value with a term \arg{Start}{\tt{} - }\arg{Size}, indicating the properties of the current
		  window. \\
    \end{tabular}
\predicate{table_window}{3}{+Handle, +Start, +Size}
    If only part of the file represents the table, this call may be used
    to define a window on the file. \arg{Start} defines the start of the
    window relative to the start of the file.  \arg{Size} is the size in
    characters.  Skipping a header is one of the possible purposes for
    this call.
\predicate{table_start_of_record}{4}{+Handle, +From, +To, -Start}
    Enumerates (on backtracking) the start of records in the table
    in the region [From, To).  Together with read_table_record/4,
    this may be used to read the table's data.
\predicate{table_previous_record}{3}{+Handle, +Here, -Previous}
    If \arg{Here} is the start of a record, find the start of the
    record before it.  If \arg{Here} points at an arbitrary location
    in a record, the start of this record will be returned.
\end{description}

\subsubsection{Reading records}
\label{sec:table-read-record}

There are two predicates for reading records. The read_table_record/4
reads an entire record, while read_table_fields/4 reads one or more
fields from a record.

\begin{description}
\predicate{read_table_record}{4}{+Handle, +Start, -Next, -Record}
    Read a record from the table. \arg{Handle} is a handle as returned
    by new_table/4. \arg{Start} is the location of a record. If \arg{Start} does not point to the start of a record, this predicate
    searches backwards for the starting position. \arg{Record} is
    unified with a term constructed from the \arg{functor} associated
    with the table (default name {\tt record} and arity the number of
    not-skipped columns), each of the arguments containing the converted
    data. An error is raised if the data could not be converted. \arg{Next} is unified with the start position for the next record.

\predicate{read_table_fields}{4}{+Handle, +Start, -Next, -Fields}
    As read_table_record/4, but \arg{Fields} is a list of terms
    \arg{+Name}(-\arg{Value}), and the \arg{Values} will be unified
    with the values of the specified field.

\predicate{read_table_record_data}{4}{+Handle, +Start, -Next, -Record}
    Similar to read_table_record/4, but unifies record with a Prolog
    string containing the data of the record unparsed.  The returned
    record does {\bf not} contain the terminating record-separator.

\end{description}

\subsubsection{Searching the table}
\label{sec:table-search}

\begin{description}
\predicate{in_table}{3}{+Handle, ?Fields, -RecordPos}
    Searches the table for records matching \arg{Fields}.  If a match
    is found, the variable (see below) fields in \arg{Fields} are
    unified with the corresponding field value, and \arg{RecordPos}
    is unified with the position of the record.  The latter handle may
    be used in a subsequent call to read_table_record/4 or
    read_table_fields/4.

    \arg{Fields} is a list of field specifiers.  Each specifier is of
    the format:

    \begin{quote}
    \arg{FieldName}(\arg{Value} [, \arg{Options}])
    \end{quote}

    \arg{Options} is a list of options to specify the search. By
    default, the package will search for an exact match, possibly
    using the ordering table associated with the field (see {\tt order}
    option in new_table/4).  Options are:

    \begin{center}
    \begin{tabular}{|l|p{3in}|}
    \hline
    \tt prefix		& Uses prefix search with the default table. \\
    \tt prefix(\arg{Table}) & Uses prefix search with the specified
			      ordering table. \\
    \tt substring	& Searches for a substring in the field.  This
			  requires linear search of the table. \\
    \tt substring(\arg{Table}) & Searches for a substring, using the
			      table information for determining the
			      equivalence of characters. \\
    \tt =		& Default equivalence. \\
    \tt =(\arg{Table})  & Equivalence using the given table. \\
    \hline
    \end{tabular}
    \end{center}

    If \arg{Value} is unbound (i.e.\ a variable), the record is
    considered not specified.  The possible option list is ignored.
    If a match is found on the remaining fields, the variable is unified
    with the value found in the field.

    First, the system checks whether there is an ordered field that is
    specified.  In this case, binary search is employed to find the
    matching record(s).  Otherwise, linear search is used.

    If the match contains a specified field that has the property
    {\tt unique} set (see new_table/4), in_table/3 succeeds
    deterministically.  Otherwise it will create a backtrack-point and
    backtracking will yield further solutions to the query.

    in_table/3 may be comfortable used to bind the table transparently
    to a predicate. For example, we have a file with lines of the
    format.\footnote{This is the {\tt disproot.dat} table from the
		     {\sc aat} database used in {\sc grasp}}

    \begin{code}
    C1C2,Full Name
    \end{code}

    \arg{C1C2} is a two-character identifier used in the other tables,
    and \arg{FullName} is the description of the identifier.  We want
    to have a predicate identifier_name(?Id, ?FullName) to reflect this
    table.  The code below does the trick:

    \begin{code}
    :- dynamic stored_idtable_handle/1.


    idtable(Handle) :-
            stored_idtable_handle(Handle).
    idtable(Handle) :-
	    new_table('disproot.dat',
		      [ id(atom, [downcase, sorted, unique]),
			name(atom)
		      ],
		      [ field_separator(0',)
		      ], Handle),
	    assert(stored_idtable_handle(Handle)).

    identifier_name(Id, Name) :-
	    idtable(Handle),
	    in_table(Handle, [id(Id), name(Name)], _).
    \end{code}
\end{description}

\subsubsection{Miscellaneous}
\label{sec:table-misc}

\begin{description}
    \predicate{table_version}{2}{-Version, -CompileDate}
Unify \arg{Version} with an atom identifying the version of this
package, and \arg{CompileDate} with the date this package was compiled.
\end{description}

\section{Flexible ordering and equivalence based on character table}
\label{sec:table-ordering}

This package was developed as part of the {\sc grasp} project, where it
is used for browsing lexical and ontology information, which is normally
stored using `dictionary' order, rather than the more conventional
alphabetical ordering based on character codes. To achieve programmable
ordering, the table package defines `order tables'.  An order table is
a table with the cardinality of the size of the character set (256 for
extended {\sc ascii}), and maps each character onto its `order number',
and some characters onto special codes.

The default ({\tt exact}) table matches all character codes onto
themselves.  The default {\tt case_insensitive} table matches all
uppercase characters onto their corresponding lowercase character.
The tables {\tt iso_latin_1} and {\tt iso_latin_1_case_insensitive}
map the ISO-latin-1 letters with diacritics into their plain
counterpart.

To support dictionary ordering, the following special categories are
defined:

\begin{center}
\begin{tabular}{|l|p{3in}|}
\hline
ignore &	Characters of the ignore set are simple discarded from
		the input. \\
break &		Characters from the break set are treated as word-breaks,
		and each non-empty sequence of them is considered equal.
		A word break precedes a normal character. \\
tag &		Characters of type tag indicate the start of a `tag'
		that should not be considered in ordering, unless both
		strings are the same upto the tag. \\
\hline
\end{tabular}
\end{center}

The following predicates are defined to manage and use these tables:

\begin{description}
\predicate{new_order_table}{2}{+Name, +Options}
    Create a new, or replace the order-table with the given name (an
    atom).  \arg{Options} is a list of options:

    \begin{center}
    \begin{tabular}{|l|p{3in}|}
    \hline
    \tt case_insensitive	& Map all upper- to lowercase characters. \\
    \tt iso_latin_1		& Start with an ISO-Latin-1 table \\
    \tt iso_latin_1_case_insensitive & Start with a case-insensitive ISO-Latin-1 table \\
    \tt copy(+\arg{Table})	& Copy all entries from \arg{Table}. \\
    \tt tag(+\arg{ListOfCodes}) & Add these characters to the set of
				  `tag' characters. \\
    \tt ignore(+\arg{ListOfCodes}) & Add these characters to the set of
				  `ignore' characters. \\
    \tt break(+\arg{ListOfCodes}) & Add these characters to the set of
				  `break' characters. \\
    \tt +\arg{Code1} = +\arg{Code2} & Map \arg{Code1} onto \arg{Code2}. \\
    \hline
    \end{tabular}
    \end{center}
\predicate{order_table_mapping}{3}{+Table, ?From, ?To}
    Read the current mapping.  \arg{To} is a character code or one of
    the atoms \const{break}, \const{ignore} or \const{tag}.
\predicate{compare_strings}{4}{+Table, +S1, +S2, -Result}
    Compare two strings using the named \arg{Table}.  \arg{S1} and
    \arg{S2} may be atoms, strings or code-lists. \arg{Result} is one
    of the atoms \verb$<$, \verb$=$ or \verb$>$.
\predicate{prefix_string}{3}{+Table, +Prefix, +String}
    Succeeds if \arg{Prefix} is a prefix of \arg{String} using the named
    \arg{Table}.
\predicate{prefix_string}{4}{+Table, +Prefix, -Rest, +String}
    Succeeds if \arg{Prefix} is a prefix of \arg{String} using the named
    \arg{Table}, and \arg{Rest} is unified with the remainder of
    \arg{String} that is not matched.  Please note that the existence of
    an order-table implies simple contatenation using atom_concat/3
    cannot be used to determine the non-matched part of the string.
    \predicate{sub_string}{3}{+Table, +Sub, +String} Succeeds if
    \arg{Sub} is a substring of \arg{String} using the named
    \arg{Table}.
\end{description}

\section{Example: accessing the Unix passwd file}	\label{sec:example}

The Unix passwd file is a file with records spanning a single line each.
The fields are separated by a single `:' character.  Here is an example
of a line:

\begin{code}
joe:hgdu3r3bce:53:100:Joe Johnson:/users/joe:/bin/bash
\end{code}

The following call defines a table for it:

\begin{code}
?- new_table('/etc/passwd',
	     [ user(atom),
	       passwd(code_list),
	       uid(integer),
	       gid(integer),
	       gecos(code_list),
	       homedir(atom),
	       shell(atom)
	     ],
	     [ field_separator(0':)
	     ],
	     H).
\end{code}

To find all people of group \arg{100}, use:

\begin{code}
?- findall(User, in_table(H, [user(User), gid(100)], _), Users).
\end{code}

\end{document}