/* Part of SWI-Prolog Author: Jan Wielemaker E-mail: J.Wielemaker@vu.nl WWW: http://www.swi-prolog.org Copyright (c) 2011-2023, University of Amsterdam VU University Amsterdam CWI, Amsterdam SWI-Prolog Solutions b.v. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef __WINDOWS__ #include #endif #include "pl-incl.h" #include "pl-ctype.h" #include "pl-utf8.h" #include "../pl-fli.h" #ifdef HAVE_UNISTD_H #include #endif #ifdef O_XOS # include "windows/dirent.h" #else #if HAVE_DIRENT_H # include #else # define dirent direct # if HAVE_SYS_NDIR_H # include # endif # if HAVE_SYS_DIR_H # include # endif # if HAVE_NDIR_H # include # endif #endif #endif /*O_XOS*/ #ifdef HAVE_SYS_STAT_H #include #endif #ifdef HAVE_SYS_PARAM_H #include #endif #define O_EXPANDS_TESTS_EXISTS 1 #ifndef IS_DIR_SEPARATOR #define IS_DIR_SEPARATOR(c) ((c) == '/') #endif /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Unix Wildcard Matching. Recognised: ? matches one arbitrary character * matches any number of any character [xa-z] matches x and a-z {p1,p2} matches pattern p1 or p2 backslash (\) escapes a character. First the pattern is compiled into an intermediate representation. Next this intermediate representation is matched against the target. The non-ascii characters are used to store control sequences in the intermediate representation: ANY Match any character STAR Match (possibly empty) sequence ALT Match, if fails, continue at + offset JMP Jump instructions ANYOF Next 16 bytes are bitmap - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ #define MAXCODE 1024 #define CBASE 0x80000000 #define ANY (CBASE+1) #define STAR (CBASE+2) #define ALT (CBASE+3) #define JMP (CBASE+4) #define ANYOF (CBASE+5) #define ANYRANGE (CBASE+6) #define EXIT (CBASE+7) #define NOCURL 0 #define CURL 1 #define M_IGNCASE 0x01 typedef unsigned int matchcode; typedef struct { tmp_buffer pattern; } compiled_pattern; static const char *compile_pattern(compiled_pattern *pattern, const char *s, int curl, int flags); static int match_pattern(matchcode *pattern, const char *s, int flags); #define Output(c) addBuffer(&Out->pattern, c, matchcode) #define Here() entriesBuffer(&Out->pattern, matchcode) #define CodeAt(o) (baseBuffer(&Out->pattern, matchcode)[o]) static void initPattern(compiled_pattern *cbuf) { initBuffer(&cbuf->pattern); } static void discardPattern(compiled_pattern *cbuf) { discardBuffer(&cbuf->pattern); initBuffer(&cbuf->pattern); } static int compilePattern(char *p, compiled_pattern *cbuf, int flags) { initPattern(cbuf); if ( compile_pattern(cbuf, p, NOCURL, flags) ) return TRUE; discardPattern(cbuf); return FALSE; } static int utf8_peek(const char *p, int n) { int c; while(n-- > 0) p = utf8_skip_char(p); PL_utf8_code_point(&p, NULL, &c); return c; } static const char * compile_pattern(compiled_pattern *Out, const char *p, int curl, int mflags) { int c; for(;;) { PL_utf8_code_point(&p, NULL, &c); switch(c) { case EOS: p--; break; case '\\': { int c2; PL_utf8_code_point(&p, NULL, &c2); Output(c2 == EOS ? c : c2); if ( c2 == EOS ) break; continue; } case '?': Output(ANY); continue; case '*': Output(STAR); continue; case '[': { size_t mapsize_at; Output(ANYOF); mapsize_at = Here(); Output(0); for(;;) { PL_utf8_code_point(&p, NULL, &c); switch( c ) { case EOS: return PL_syntax_error("Unmatched '['", NULL),NULL; case '\\': { int c2; PL_utf8_code_point(&p, NULL, &c2); if ( c2 == EOS ) return PL_syntax_error("Unmatched '['", NULL),NULL; Output(c2); continue; } case ']': { size_t sz = Here(); sz -= mapsize_at+1; CodeAt(mapsize_at) = sz; break; } default: { int ce; if ( c != ']' && utf8_peek(p, 0) == '-' && (ce=utf8_peek(p, 1)) != ']' ) { Output(ANYRANGE); Output(c); Output(ce); p = utf8_skip_char(p); p = utf8_skip_char(p); } else Output(c); continue; } } break; } continue; } case '{': { int ai, aj = -1; for(;;) { Output(ALT); ai = Here(); Output(0); if ( !(p = compile_pattern(Out, p, CURL, mflags)) ) return NULL; if ( aj > 0 ) CodeAt(aj) = Here() - aj; if ( *p == ',' ) { Output(JMP); aj = Here(); Output(0); CodeAt(ai) = Here() - ai; Output(ALT); ai = Here(); Output(0); p++; } else if ( *p == '}' ) { p++; break; } else { return PL_syntax_error("Unmatched '{'", NULL),NULL; } } continue; } case ANY: case STAR: case ALT: case JMP: case ANYOF: case ANYRANGE: case EXIT: assert(0); return PL_error(NULL, 0, "Reserved character", ERR_REPRESENTATION, ATOM_pattern),NULL; case '}': case ',': if ( curl == CURL ) { p--; return p; } /*FALLTHROUGH*/ default: if ( (mflags&M_IGNCASE) ) c = makeLowerW(c); Output(c); continue; } Output(EXIT); return p; } } static int matchPattern(const char *s, compiled_pattern *cbuf, int flags) { return match_pattern(baseBuffer(&cbuf->pattern, matchcode), s, flags); } static bool match_pattern(matchcode *p, const char *s, int flags) { matchcode c; for(;;) { switch( c = *p++ ) { case EXIT: return (*s == EOS ? TRUE : FALSE); case ANY: /* ? */ if ( *s == EOS ) return FALSE; s = utf8_skip_char(s); continue; case ANYOF: /* [...] */ { int c2; int sz = *p++; matchcode *anyend = &p[sz]; PL_utf8_code_point(&s, NULL, &c2); if ( (flags&M_IGNCASE) ) c2 = makeLowerW(c2); while(p= p[1] && c2 <= p[2] ) goto match; p += 3; } else { if ( *p == c2 ) goto match; p++; } } return FALSE; match: p = anyend; continue; } case STAR: /* * */ { int c2; do { if ( match_pattern(p, s, flags) ) return TRUE; PL_utf8_code_point(&s, NULL, &c2); } while(c2); return FALSE; } case JMP: /* { ... } */ p += *p; continue; case ALT: if ( match_pattern(p+1, s, flags) ) return TRUE; p += *p; continue; default: /* character */ { int c2; PL_utf8_code_point(&s, NULL, &c2); if ( (flags&M_IGNCASE) ) c2 = makeLowerW(c2); if ( c == c2 ) continue; return FALSE; } } } } /** wildcard_match(+Pattern, +Name [, +Options]) is semidet. */ static const PL_option_t wildcard_options[] = { { ATOM_case_sensitive, OPT_BOOL }, { NULL_ATOM, 0 } }; #define wildcard_match(pattern, string, options) LDFUNC(wildcard_match, pattern, string, options) static int wildcard_match(DECL_LD term_t pattern, term_t string, term_t options) { char *p, *s; int rc = FALSE; int mflags = 0; int case_sensitive = TRUE; if ( options && !PL_scan_options(options, 0, "wildcard_option", wildcard_options, &case_sensitive) ) return FALSE; if ( !case_sensitive ) mflags |= M_IGNCASE; PL_STRINGS_MARK(); if ( PL_get_chars(pattern, &p, CVT_ALL|REP_UTF8|CVT_EXCEPTION) && PL_get_chars(string, &s, CVT_ALL|REP_UTF8|CVT_EXCEPTION) ) { compiled_pattern buf; if ( (rc=compilePattern(p, &buf, mflags)) ) { rc = matchPattern(s, &buf, mflags); discardPattern(&buf); } } PL_STRINGS_RELEASE(); return rc; } static PRED_IMPL("wildcard_match", 2, wildcard_match, 0) { PRED_LD return wildcard_match(A1, A2, 0); } static PRED_IMPL("wildcard_match", 3, wildcard_match, 0) { PRED_LD return wildcard_match(A1, A2, A3); } /******************************* * EXPAND_FILE_NAME/2 * *******************************/ /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Wildcart expansion of a pattern to a list of files. This code uses two `buffers' for storing the intermediate results while limiting fragmentation. The `strings' buffer contains all strings generated. The files contains indices in the `strings' buffer pointing to the start of strings. The indices in the range [start,end) are valid. First this set is filled with the empty string. Next the directory-segment with the first wildcart is located. If found, we go through the current set, adding the segments without wildcarts, applying the wildcart on the directory and adding everything found to the set. The old set is deleted by incrementing info.start. If we are at the end, we add the remaining non-wildcart segments to each element of the set, deleting it if the result does not exits. Finally we sort the result. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ typedef struct { tmp_buffer files; /* our files */ tmp_buffer strings; /* our strings */ compiled_pattern pattern; /* Pattern buffer */ int start; /* 1-st valid entry of files */ int end; /* last valid entry of files */ } glob_info, *GlobInfo; #undef isspecial /* play safe */ #define isspecial(c) \ ((c) == '[' || (c) == '{' || (c) == '?' || (c) == '*') static void free_expand_info(GlobInfo info) { discardBuffer(&info->files); discardBuffer(&info->strings); discardPattern(&info->pattern); } #ifndef __WINDOWS__ #define mb_add_path(path, info) LDFUNC(mb_add_path, path, info) static void mb_add_path(DECL_LD const char *path, GlobInfo info) { int idx = (int)entriesBuffer(&info->strings, char); PL_chars_t txt; PL_STRINGS_MARK(); txt.text.t = (char*)path; txt.length = strlen(path); txt.encoding = ENC_ANSI; txt.storage = PL_CHARS_HEAP; txt.canonical = FALSE; if ( (PL_canonicalise_text(&txt) == TRUE) && PL_mb_text(&txt, REP_UTF8) ) { addMultipleBuffer(&info->strings, txt.text.t, txt.length+1, char); addBuffer(&info->files, idx, int); info->end++; } else { assert(0); } PL_free_text(&txt); PL_STRINGS_RELEASE(); } #endif static void utf8_add_path(const char *path, GlobInfo info) { int idx = (int)entriesBuffer(&info->strings, char); size_t n = strlen(path)+1; addMultipleBuffer(&info->strings, path, n, char); addBuffer(&info->files, idx, int); info->end++; } static const char * expand_str(GlobInfo info, int at) { char *s = &fetchBuffer(&info->strings, at, char); return (const char *)s; } static const char * expand_entry(GlobInfo info, int idx) { int at = fetchBuffer(&info->files, idx, int); return expand_str(info, at); } static void un_escape(char *to, const char *from, const char *end) { while( from < end ) { if ( *from == '\\' && (isspecial(from[1]) || from[1] == '\\') ) from++; *to++ = *from++; } *to = EOS; } #define utf8_exists_file(name) LDFUNC(utf8_exists_file, name) static int utf8_exists_file(DECL_LD const char *name) { #ifndef __WINDOWS__ PL_chars_t txt; int rc; PL_STRINGS_MARK(); txt.text.t = (char*)name; txt.length = strlen(name); txt.encoding = ENC_UTF8; txt.storage = PL_CHARS_HEAP; txt.canonical = FALSE; rc = ( (PL_canonicalise_text(&txt) == TRUE) && PL_mb_text(&txt, REP_FN) && AccessFile(txt.text.t, ACCESS_EXIST) ); PL_free_text(&txt); PL_STRINGS_RELEASE(); return rc; #else // _xos_access_dir simply uses _waccess(). That is good enough for us. return _xos_access_dir(name, F_OK) == 0 ? TRUE : FALSE; #endif } #define utf8_opendir(name) LDFUNC(utf8_opendir, name) static DIR* utf8_opendir(DECL_LD const char *name) { #ifndef __WINDOWS__ PL_chars_t txt; DIR *rc; PL_STRINGS_MARK(); txt.text.t = (char*)name; txt.length = strlen(name); txt.encoding = ENC_UTF8; txt.storage = PL_CHARS_HEAP; txt.canonical = FALSE; if ( (PL_canonicalise_text(&txt) == TRUE) && PL_mb_text(&txt, REP_FN) ) rc = opendir(txt.text.t); else rc = NULL; PL_free_text(&txt); PL_STRINGS_RELEASE(); return rc; #else return opendir(name); #endif } static int expand(const char *pattern, GlobInfo info) { GET_LD const char *pat = pattern; char prefix[PATH_MAX]; /* before first pattern */ char patbuf[PATH_MAX]; /* pattern buffer */ size_t prefix_len; int end, dot; int mflags = 0; if ( !truePrologFlag(PLFLAG_FILE_CASE) ) mflags |= M_IGNCASE; initBuffer(&info->files); initBuffer(&info->strings); initPattern(&info->pattern); info->start = 0; info->end = 0; utf8_add_path("", info); for(;;) { const char *s = pat, *head = pat, *tail; for(;;) { int c; switch( (c=*s++) ) { case EOS: if ( s > pat ) /* something left and expanded */ { size_t prefix_len; un_escape(prefix, pat, s); prefix_len = strlen(prefix); end = info->end; for( ; info->start < end; info->start++ ) { char path[PATH_MAX]; const char *entry = expand_entry(info, info->start); size_t plen = strlen(entry); if ( plen+prefix_len+2 <= PATH_MAX ) { strcpy(path, entry); if ( prefix[0] && plen > 0 && path[plen-1] != '/' ) path[plen++] = '/'; strcpy(&path[plen], prefix); if ( end == 1 || utf8_exists_file(path) ) utf8_add_path(path, info); } } } return TRUE; case '[': /* meta characters: expand */ case '{': case '?': case '*': break; case '\\': if ( isspecial(*s) ) { s++; continue; } /*FALLTHROUGH*/ default: if ( IS_DIR_SEPARATOR(c) ) head = s; continue; } break; } for( tail=s; *tail && !IS_DIR_SEPARATOR(*tail); tail++ ) ; /* By now, head points to the start of the path holding meta characters, while tail points to the tail: ..../meta*path/.... ^ ^ head tail */ un_escape(prefix, pat, head); un_escape(patbuf, head, tail); prefix_len = strlen(prefix); discardPattern(&info->pattern); if ( !compilePattern(patbuf, &info->pattern, mflags) ) /* syntax error */ return FALSE; dot = (patbuf[0] == '.'); /* do dots as well */ end = info->end; for(; info->start < end; info->start++) { DIR *d; struct dirent *e; char path[PATH_MAX]; char tmp[PATH_MAX]; const char *current = expand_entry(info, info->start); size_t clen = strlen(current); if ( clen+prefix_len+1 > sizeof(path) ) continue; strcpy(path, current); strcpy(&path[clen], prefix); if ( (d=utf8_opendir((path[0] ? OsPath(path, tmp) : "."))) ) { size_t plen = clen+prefix_len; if ( plen > 0 && path[plen-1] != '/' ) path[plen++] = '/'; for(e=readdir(d); e; e = readdir(d)) { if ( (dot || e->d_name[0] != '.') && matchPattern(e->d_name, &info->pattern, mflags) ) { char newp[PATH_MAX]; if ( plen+strlen(e->d_name)+1 < sizeof(newp) ) { strcpy(newp, path); strcpy(&newp[plen], e->d_name); #ifdef __WINDOWS__ /* readdir() emulation emits UTF-8 */ utf8_add_path(newp, info); #else mb_add_path(newp, info); #endif } } if ( PL_handle_signals() < 0 ) return FALSE; } closedir(d); } } pat = tail; if ( IS_DIR_SEPARATOR(*pat) ) pat++; } } static int utf8casecmp(const char *s1, const char *s2) { while(*s1 && *s2) { int c1, c2; s1 = utf8_get_char(s1, &c1); s2 = utf8_get_char(s2, &c2); c1 = makeLowerW(c1); c2 = makeLowerW(c2); if ( c1 != c2 ) return c1-c2; } return *s1 && *s2 ? 0 : *s1 ? 1 : -1; } static int compareBagEntries(const void *a1, const void *a2) { GET_LD GlobInfo info = LD->glob_info; int i1 = *(int *)a1; int i2 = *(int *)a2; const char *s1, *s2; s1 = expand_str(info, i1); s2 = expand_str(info, i2); if ( truePrologFlag(PLFLAG_FILE_CASE) ) return strcmp(s1, s2); else return utf8casecmp(s1, s2); } static void sort_expand(GlobInfo info) { GET_LD int *ip = &fetchBuffer(&info->files, info->start, int); int is = info->end - info->start; LD->glob_info = info; qsort(ip, is, sizeof(int), compareBagEntries); } static PRED_IMPL("expand_file_name", 2, expand_file_name, 0) { PRED_LD char spec[PATH_MAX]; char *s; glob_info info; term_t l = PL_copy_term_ref(A2); term_t head = PL_new_term_ref(); int i; if ( !PL_get_chars(A1, &s, CVT_ALL|REP_UTF8|CVT_EXCEPTION) ) fail; if ( strlen(s) > sizeof(spec)-1 ) return PL_error(NULL, 0, NULL, ERR_REPRESENTATION, ATOM_max_path_length); if ( !expandVars(s, spec, sizeof(spec)) ) fail; if ( !expand(spec, &info) ) goto failout; sort_expand(&info); for( i = info.start; i< info.end; i++ ) { const char *e = expand_entry(&info, i); if ( !PL_unify_list(l, head, l) || !PL_unify_chars(head, PL_ATOM|REP_UTF8, -1, e) ) goto failout; } if ( !PL_unify_nil(l) ) { failout: free_expand_info(&info); return FALSE; } free_expand_info(&info); return TRUE; } /** directory_files(+Dir, -Files) is det. Files is a list of atoms that describe the entries in Dir. */ static PRED_IMPL("directory_files", 2, directory_files, 0) { PRED_LD char *dname; DIR *dir; if ( !PL_get_file_name(A1, &dname, PL_FILE_READ|PL_FILE_OSPATH) ) return FALSE; if ( (dir=opendir(dname)) ) { struct dirent *e; term_t tail = PL_copy_term_ref(A2); term_t head = PL_new_term_ref(); for(e=readdir(dir); e; e = readdir(dir)) { PL_put_variable(head); if ( PL_handle_signals() < 0 || !PL_unify_list(tail, head, tail) || !PL_unify_chars(head, PL_ATOM|REP_FN, (size_t)-1, e->d_name) ) { closedir(dir); return FALSE; } } closedir(dir); return PL_unify_nil(tail); } return PL_error(NULL, 0, MSG_ERRNO, ERR_FILE_OPERATION, ATOM_open, ATOM_directory, A1); } /******************************* * PUBLISH PREDICATES * *******************************/ BeginPredDefs(glob) PRED_DEF("expand_file_name", 2, expand_file_name, 0) PRED_DEF("wildcard_match", 2, wildcard_match, 0) PRED_DEF("wildcard_match", 3, wildcard_match, 0) PRED_DEF("directory_files", 2, directory_files, 0) EndPredDefs