(ns net.cgrand.sjacket.parser
  "A grammar and parser for Clojure."
  (:require [net.cgrand.parsley :as p]
            [net.cgrand.regex :as re]
            [net.cgrand.regex.charset :as cs]
            [net.cgrand.regex.unicode :as unicode]))

(def macro-char
  "Charset of the characters this grammar treats as reader macro characters."
  (cs/charset "\";'@^`~()[]{}\\%#"))
(def terminating-macro-char
  "The macro characters that end a token: macro-char without #, ' and %.
  Those three are left out so that they stay legal inside a token
  (see token-char)."
  (let [allowed-inside-token #{\# \' \%}]
    (cs/- macro-char allowed-inside-token)))
(def dispatch-macro-char
  "Charset of the characters that have a dedicated meaning right after #.
  The :reader-literal rule refuses a # that is followed by one of them."
  (cs/charset "^'\"({=!<_"))

(def whitespace-char
  "Charset of whitespace: the Unicode categories Zs, Zl and Zp, the comma,
  and the control ranges U+0009..U+000D and U+001C..U+001F, with the
  non-breaking spaces U+00A0, U+2007 and U+202F taken out."
  (let [space-separators     (unicode/cats "Zs")
        line-separators      (unicode/cats "Zl")
        paragraph-separators (unicode/cats "Zp")
        ;; map entries are used as ranges here, like {\0 \9} elsewhere
        ;; in this file
        control-ranges       {\u0009 \u000D, \u001C \u001F}
        non-breaking-spaces  #{\u00A0, \u2007 \u202F}]
    (cs/-
      (cs/+ space-separators
            line-separators
            paragraph-separators
            \,
            control-ranges)
      non-breaking-spaces)))

(def constituent-char
  "Charset built by cs/not from whitespace-char and macro-char; used for the
  tail of :char and :number literals."
  (cs/not whitespace-char
          macro-char))

(def token-char
  "Charset built by cs/not from whitespace-char and terminating-macro-char.
  Maps to readToken."
  (cs/not whitespace-char
          terminating-macro-char))

(def start-token-char
  "Charset of the characters that may open a symbol-like token: token-char
  without the digits 0-9, without / and :, and without any macro-char."
  (let [digits {\0 \9}]
    (cs/- token-char
          digits
          "/:"
          macro-char)))

(def kw-char
  "Charset used for keyword characters: token-char without / and :, and
  without any macro-char. Unlike start-token-char, digits are kept; this is
  to allow numeric keywords, because of the outcome of
  CLJ-1003/CLJ-1252/CLJ-1286."
  (cs/- token-char
        "/:"
        macro-char))

(defn token
  "Returns a regex made of `re` followed by a (re/?! token-char) lookahead,
  so that `re` only matches when the next character is not a token-char."
  [re]
  ; TODO compute the union of two regexes
  (->> (re/?! token-char)
       (re/regex re)))

;; Not compiled: the #_ reader macro discards the whole form below.
;; It sketches a stricter pattern for integer literals -- an optional sign
;; followed by one of: a lone 0, a decimal number, a 0x/0X hexadecimal number,
;; a 0-prefixed octal number, or a radix (r/R) number.
;; The :number rule in `rules` uses a looser pattern instead.
#_(re/regex #{\+ \-} :? 
       #{["0" (?! {\8 \9})] 
         [{\1 \9} {\0 \9} :*]
         ["0" #{\x \X} {\0 \9 \A \F \a \f} :+]
         ["0" {\0 \7} :+ (?! {\0 \9})]
         [{\1 \9} {\0 \9} :? #{\r \R} {\0 \9 \A \Z \a \z} :+]})


(def rules
  "Grammar for Clojure source: a map from rule keyword to its production.
  This map is what `parser` hands to p/make-parser.

  Productions are written with keywords (references to other rules), sets
  (alternatives), vectors (sequences), strings, chars and regex patterns
  (terminals) and net.cgrand.regex expressions."
  {;; every form that can appear where an expression is expected
   :sexpr- #{:nil :boolean :char :string :regex :number :symbol :keyword
             :list :vector :map :set :fn
             :meta :var :deref :quote :syntax-quote :unquote :unquote-splicing
             :unreadable :eval :reader-literal}

   ;; --- atoms ---------------------------------------------------------
   :nil (token "nil")
   :boolean #{(token "true") (token "false")}
   ;; a backslash, any one character, then any run of constituent chars
   ;; (covers \a as well as named chars such as \newline)
   :char (p/unspaced
           [\\
            (re/regex
              cs/any-char
              (re/* constituent-char))])
   ;; A string body is any run of:
   ;;   - a character other than `"` and `\`
   ;;   - a simple escape: \t \r \n \\ \" \b \f
   ;;   - \u followed by exactly four hexadecimal digits
   ;;   - a backslash followed by one to three octal digits
   ;; The two numeric escapes are matched strictly: hex letters must be
   ;; accepted after \u (e.g. \uFEFF), and an escape must never be able to
   ;; swallow the closing quote (as a loose `.{0,2}` tail would in "\0").
   :string (p/unspaced
              [\"
               #"([^\"\\]|\\[trn\\\"bf]|\\u[0-9a-fA-F]{4}|\\[0-7]{1,3})*+"
               \"])

   ;; `#` immediately followed by a string-like body; any backslash escape
   ;; is accepted inside
   :regex (p/unspaced
            [(re/regex \# (re/?= \"))
             \"
             #"([^\"\\]|\\.)*+"
             \"])
   ;; numbers should be validated but this is the exact "scope" of a number
   :number (re/regex (re/? #{\+ \-}) {\0 \9} (re/* constituent-char))

   ;; --- symbols -------------------------------------------------------
   ;; name part after a namespace: no check against true/false/nil
   :unrestricted.name (token #{"/"
                               [start-token-char (re/* (cs/- token-char \/))]})
   ;; namespace part: must be followed by `/`, must not be true/false/nil
   ;; nor start like a signed number
   :sym.ns (re/regex (re/?! #{(token #{"true" "false" "nil"})
                              [#{\+ \-} {\0 \9}]})
                     start-token-char
                     (re/* (cs/- token-char \/))
                     (re/?= \/))
   ;; unqualified symbol: same exclusions as :sym.ns, and additionally
   ;; allowed to start with %
   :sym.name (re/regex
               (re/?! #{(token #{"true" "false" "nil"})
                        [#{\+ \-} {\0 \9}]})
               (token
                 #{"/"
                   [(cs/+ start-token-char \%) (re/* (cs/- token-char \/))]}))
   :symbol #{(p/unspaced :sym.ns "/" :unrestricted.name)
             :sym.name}

   ;; --- keywords ------------------------------------------------------
   :kw.ns (re/regex kw-char
                    (re/* kw-char)
                    (re/?= \/))

   :kw.name (token #{"/"
                     [kw-char (re/* (cs/- kw-char \/))]})

   ;; one or two colons, then an optionally namespaced name
   :keyword [(re/regex (re/repeat ":" 1 2))
             #{(p/unspaced :kw.ns "/" :kw.name)
               (p/unspaced :kw.name)}]

   ;; --- collections and reader macros ---------------------------------
   :list ["(" :sexpr* ")"]
   :vector ["[" :sexpr* "]"]
   :map ["{" :sexpr* "}"]
   :set ["#{" :sexpr* "}"]
   :fn ["#(" :sexpr* ")"]
   ;; both ^ and the older #^ spelling
   :meta [#"#?\^" :sexpr :sexpr]
   :var ["#'" :symbol]
   :deref ["@" :sexpr]
   :quote ["'" :sexpr]
   :syntax-quote ["`" :sexpr]
   ;; ~ not followed by @, so that ~@ is left to :unquote-splicing
   :unquote [#"~(?!@)" :sexpr]
   :unquote-splicing ["~@" :sexpr]
   :eval ["#=" :list]
   ;; # not followed by a dispatch character, then a tag symbol and a form
   :reader-literal [(re/regex \# (re/?! dispatch-macro-char)) :symbol :sexpr]

   ;; --- space-like nodes (see space-nodes) and leftovers ----------------
   ;; from ; or #! up to, but not including, the end of the line
   :comment (p/unspaced #{";" "#!"} #"[^\n]*")
   :unreadable "#<"
   :discard ["#_" :sexpr]

   :newline \newline
   ;; any run of whitespace that contains no newline
   :whitespace (re/regex (re/+ (cs/- whitespace-char \newline)))})

(def space-nodes
  "Names of the rules that `parser` uses as its :space production."
  #{:comment
    :discard
    :newline
    :whitespace})

(def parser
  "Parser built by p/make-parser from `rules`: :sexpr* is the main
  production, any repetition of space-nodes is the :space production, and
  ::root is the root tag."
  (->> rules
       (p/make-parser {:main     :sexpr*
                       :space    [space-nodes :*]
                       :root-tag ::root})))