;;; pyim-dregcache --- map dictionary to plain cache and use regular expression to search     -*- lexical-binding: t; -*-

;; * Header
;; Copyright (C) 2019-2021 Free Software Foundation, Inc.

;; Author: Chen Bin <chenbin.sh@gmail.com>
;; Maintainer: Chen Bin <chenbin.sh@gmail.com>
;;             Feng Shu <tumashu@163.com>
;; URL: https://github.com/tumashu/pyim
;; Keywords: convenience, Chinese, pinyin, input-method

;; This file is part of GNU Emacs.

;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.

;;; Commentary:

;; * 说明文档                                                              :doc:
;; 这个文件将词典直接映射到内存.使用正则表达式(Regular Expression)搜索词库.
;; 搜索速度较快,消耗内存极少.
;;
;; 可以 (setq pyim-dcache-backend 'pyim-dregcache) 然后重启输入法启用此引擎

;;; Code:
;; * 代码                                                                 :code:
(require 'pyim-common)
(require 'subr-x)

(defvar pyim-dregcache-partition-minimum-size 32
  "小于这个数值(单位为M)的词典不需要用分区算法加速.可节约一半的内存.")

(defvar pyim-dregcache-cache nil)
(defvar pyim-dregcache-icode2word nil)
(defvar pyim-dregcache-iword2count nil)
(defvar pyim-dregcache-dicts-md5 nil)

(defun pyim-dregcache-variable-file (variable)
  "Get VARIABLE dcache file path."
  (concat (file-name-as-directory pyim-dcache-directory)
          (symbol-name variable)))

(defun pyim-dregcache-save-variable (variable value)
  "Save VARIABLE with its VALUE."
  (let* ((file (pyim-dregcache-variable-file variable))
         (save-silently t))
    (make-directory (file-name-directory file) t)
    (with-temp-buffer
      (insert value)
      (pyim-dcache-write-file file))))

(defun pyim-dregcache-load-variable (variable)
  "载入 VARIABLE 对应的文件内容."
  (let* ((file (pyim-dregcache-variable-file variable)))
    (when (and file (file-exists-p file))
      (with-temp-buffer
        (insert-file-contents file)
        (buffer-string)))))

(defun pyim-dregcache-sort-words (words-list)
  "对 WORDS-LIST 排序，词频大的排在前面.

排序使用 `pyim-dregcache-iword2count' 中记录的词频信息"
  (sort words-list
        (lambda (a b)
          (let ((a (car (split-string a ":")))
                (b (car (split-string b ":"))))
            (> (or (gethash a pyim-dregcache-iword2count) 0)
               (or (gethash b pyim-dregcache-iword2count) 0))))))

(defun pyim-dregcache-sort-icode2word ()
  "对个人词库排序."
  ;; https://github.com/redguardtoo/zhfreq
  (with-temp-buffer
    (dolist (l (split-string pyim-dregcache-icode2word "\n"))
      (cond
       ((string-match "^\\([a-z-]+ \\)\\(.*\\)" l)
        ;; 3字以上词很少，如果只处理单字,2字词,3字词
        ;; ((string-match "^\\([a-z]+ \\|[a-z]+-[a-z]+ \\|[a-z]+-[a-z]+-[a-z]+ \\)\\(.*\\)" l)
        (let* ((pinyin (match-string 1 l))
               (words (pyim-dregcache-sort-words (split-string (match-string 2 l) " "))))
          (insert (format "%s\n" (concat pinyin (mapconcat #'identity words " "))))))
       ;; 其他词
       ((string= l "")
        ;; skip empty line
        )
       (t
        (insert (format "%s\n" l)))))
    (setq pyim-dregcache-icode2word (buffer-string))))

(defun pyim-dregcache-create-cache-content (raw-content)
  "将 RAW-CONTENT 划分成可以更高效搜索的缓冲区."
  (let* (rlt)
    (cond
     ;; 小词库不用划分"子搜索区域".
     ;; `pyim-dregcache-partition-minimum-size'定义了小词库的最大值
     ((< (length raw-content) (* pyim-dregcache-partition-minimum-size 1024 1024))
      (setq rlt (list :content raw-content)))
     (t
      (let* ((chars "bcdefghjklmnopqrstwxyz")
             pattern
             (i 0)
             dict-splited
             (content-segments '())
             (start 0)
             end)
        ;; 将字典缓存划分成多个"子搜索区域"
        (while (< i (length chars))
          (setq pattern (concat "^" (string (elt chars i))))
          (setq end (string-match pattern raw-content start))
          (when end
            (setq content-segments
                  `(,@content-segments ,(substring-no-properties raw-content start end)))
            (setq dict-splited t)
            ;; 将搜索起始点前移
            (setq start end))
          (setq i (1+ i)))

        (cond
         ;; attach segments
         (dict-splited
          ;; 将剩余的附后
          (setq content-segments
                `(,@content-segments ,(substring-no-properties raw-content start end)))
          (setq rlt (list :content content-segments)))
         (t
          (setq rlt (list :content raw-content)))))))
    rlt))

(defun pyim-dregcache-load-dictionary-file (dict-file)
  "READ from DICT-FILE."
  (let* ((raw-content (with-temp-buffer
                        (insert-file-contents dict-file)
                        (buffer-string))))
    (setq pyim-dregcache-cache
          ;; use string type as key, so have to use `lax-plist-put'
          ;; @see https://www.gnu.org/software/emacs/manual/html_node/elisp/Plist-Access.html#Plist-Access
          (lax-plist-put pyim-dregcache-cache
                         (file-truename dict-file)
                         (pyim-dregcache-create-cache-content raw-content)))))

(defun pyim-dregcache-update-code2word (dict-files dicts-md5 &optional force)
  "读取并加载词库.

读取 `pyim-dicts' 和 `pyim-extra-dicts' 里面的词库文件，生成对应的
词库缓冲文件，然后加载词库缓存。

DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.

如果 FORCE 为真，强制加载。"
  (interactive)
  (when (or force (not (equal dicts-md5 pyim-dregcache-dicts-md5)))
    ;; no hashtable i file mapping algorithm
    (dolist (file dict-files)
      (pyim-dregcache-load-dictionary-file file))
    (setq pyim-dregcache-dicts-md5 dicts-md5)))

(defmacro pyim-dregcache-shenmu2regexp (char)
  "将声母 CHAR 转换为通用正则表达式匹配所有以该声母开头的汉字."
  `(concat ,char "[a-z]*"))

(defmacro pyim-dregcache-is-shenmu (code)
  "判断CODE 是否是一个声母."
  `(and (eq (length ,code) 1)
        (not (string-match ,code "aeo"))))

(defun pyim-dregcache-code2regexp (code)
  "将 CODE 转换成正则表达式用来搜索辞典缓存中的匹配项目.
单个声母会匹配所有以此生母开头的单个汉字."
  (let* (arr)
    (cond
     ((pyim-dregcache-is-shenmu code)
      ;; 用户输入单个声母如 "w", "y", 将该声母转换为
      ;; 通用正则表达式匹配所有以该声母开头的汉字
      (pyim-dregcache-shenmu2regexp code))

     ((eq (length (setq arr (split-string code "-"))) 1)
      ;; 用户输入单个汉字的声母和韵母如 "wan", "dan", 不做任何处理
      code)

     (t
      ;; 用户输入多字词的code,如果某个字只提供了首个声母,将该声母转换为
      ;; 通用正则表达式匹配所有以该声母开头的汉字
      (let* ((s (mapconcat (lambda (e)
                             (if (pyim-dregcache-is-shenmu e)
                                 (pyim-dregcache-shenmu2regexp e)
                               e))
                           arr
                           "-")))
        ;; 当输入3字拼音时,也搜索前3字包含此拼音的更长的词.
        ;; 三字词很少(可 grep -rEsoh "^[a-z]+(-[a-z]+){2}( [^ ]+){2,6}" 验证)
        ;; 用户通常输入4字或更多字的词.
        ;; 例如符合 bei-jin-tian 的词很少, "北京天安门"是更可能匹配的词
        ;; `pyim-dregcache' 搜索算法可以保证更长的词在靠后的位置
        (cond
         ((< (length arr) 3)
          s)
         ((eq ?* (elt s (1- (length s))))
          ;; 简化正则表达式, tian-an-m[a-z]* => tian-an-m[a-z]?[a-z-]*
          (concat (substring s 0 (1- (length s))) "?[a-z-]*"))
         (t
          ;; tian-an-men => tian-an-men[a-z-]*
          (concat s "[a-z-]*"))))))))

(defmacro pyim-dregcache-match-line (code)
  `(concat "^" (pyim-dregcache-code2regexp ,code) " \\(.+\\)"))

(defun pyim-dregcache-all-dict-files ()
  "所有词典文件."
  (let* (rlt)
    (dolist (item pyim-dregcache-cache)
      (when (stringp item)
        (push item rlt)))
    ;; restore user's dictionary order
    (nreverse rlt)))

(defun pyim-dregcache-get-content (code file-info)
  "找到 CODE 对应的字典子缓冲区.  FILE-INFO 是字典文件的所有信息."
  (let* ((rlt (plist-get file-info :content))
         idx
         (ch (elt code 0)))
    (when (listp rlt)
      (cond
       ((<= ch ?h)
        (setq idx (- ch ?a)))
       ((<= ch ?t)
        ;; 'i' could not be first character of pinyin code
        (setq idx (- ch ?a 1)))
       (t
        ;; 'i', 'u', 'v' could not be first character of pinyin code
        (setq idx (- ch ?a 3))))
      ;; fetch segment using the first character of pinyin code
      (setq rlt (nth idx rlt)))
    rlt))

(defun pyim-dregcache-get-1 (content code)
  (let ((case-fold-search t)
        (start 0)
        (pattern (pyim-dregcache-match-line code))
        (content-length (length content))
        word
        output)
    (while (and (< start content-length)
                (setq start (string-match pattern content start)))
      ;; 提取词
      (setq word (match-string-no-properties 1 content))
      (when word
        (cond
         ((string-match "^[^ ]+$" word)
          ;; 单个词
          (push word output))
         (t
          ;; 多个字
          (setq output (append (nreverse (split-string word " +")) output)))))
      ;; 继续搜索
      (setq start (+ start 2 (length code) (length word))))
    output))

(defun pyim-dregcache-get (code &optional from)
  "从 `pyim-dregcache-cache' 搜索 CODE, 得到对应的词条."
  (if (or (memq 'icode2word from)
          (memq 'ishortcode2word from))
      (pyim-dregcache-get-icode2word-ishortcode2word code)
    (let ((dict-files (pyim-dregcache-all-dict-files))
          result)
      (when pyim-debug (message "pyim-dregcache-get is called. code=%s" code))
      (when dict-files
        (dolist (file dict-files)
          (let* ((file-info (lax-plist-get pyim-dregcache-cache file))
                 (content (pyim-dregcache-get-content code file-info)))
            (setq result (append (pyim-dregcache-get-1 content code) result)))))
      ;; `push' plus `nreverse' is more efficient than `add-to-list'
      ;; Many examples exist in Emacs' own code
      (setq result (nreverse result))
      `(,@result ,@(pyim-pymap-py2cchar-get code t t)))))

(defun pyim-dregcache-get-icode2word-ishortcode2word (code)
  "以 CODE 搜索个人词和个人联想词.  正则表达式搜索词库,不需要为联想词开单独缓存."
  (when pyim-debug (message "pyim-dregcache-get-icode2word-ishortcode2word called => %s" code))
  (when pyim-dregcache-icode2word
    (nreverse (pyim-dregcache-get-1 pyim-dregcache-icode2word code))))

(defun pyim-dregcache-update-personal-words (&optional force)
  "合并 `pyim-dregcache-icode2word' 磁盘文件. 加载排序后的结果.

在这个过程中使用了 `pyim-dregcache-iword2count' 中记录的词频信息。
如果 FORCE 为真，强制排序"
  (let* ((content (pyim-dregcache-load-variable 'pyim-dregcache-icode2word)))
    (when content
      (with-temp-buffer
        (let* (prev-record prev-code record code prev-point)
          (when pyim-dregcache-icode2word
            (insert pyim-dregcache-icode2word))
          (insert content)
          (sort-lines nil (point-min) (point-max))
          (delete-duplicate-lines (point-min) (point-max) nil t nil)
          (goto-char (point-min))
          (unless (re-search-forward "utf-8-unix" (line-end-position) t)
            (insert "## -*- coding: utf-8-unix -*-\n"))
          (goto-char (point-min))
          (while (not (eobp))
            ;; initiate prev-point and prev-line
            (goto-char (line-beginning-position))
            (setq prev-point (point))
            (setq prev-record (pyim-dline-parse))
            (setq prev-code (car prev-record))
            ;; 词库在创建时已经保证1个code只有1行，因此合并词库文件一般只有2行需要合并
            ;; 所以这里没有对2行以上的合并进行优化
            (when (= (forward-line 1) 0)
              (setq record (pyim-dline-parse))
              (setq code (car record))
              (when (string-equal prev-code code)
                ;; line-end-position donesn't contain "\n"
                (progn (delete-region prev-point (line-end-position))
                       (insert (string-join (delete-dups `(,@prev-record ,@record)) " ")))))))
        (setq pyim-dregcache-icode2word (buffer-string)))))
  (when (and force pyim-dregcache-icode2word)
    (pyim-dregcache-sort-icode2word)))

(defun pyim-dregcache-init-variables ()
  "初始化 cache 缓存相关变量."
  (pyim-dcache-set-variable
   'pyim-dregcache-iword2count
   nil
   ;; dregcache 引擎也需要词频信息，第一次使用 dregcache 引擎的时候，
   ;; 自动导入 dhashcache 引擎的词频信息，以后两个引擎的词频信息就
   ;; 完全分开了。
   (pyim-dcache-get-variable 'pyim-dhashcache-iword2count))
  (unless pyim-dregcache-icode2word
    (pyim-dregcache-update-personal-words t)))

(defun pyim-dregcache-save-personal-dcache-to-file ()
  "保存缓存内容到默认目录."
  (when pyim-debug (message "pyim-dregcache-save-personal-dcache-to-file called"))
  ;; 用户选择过的词存为标准辞典格式保存
  (when pyim-dregcache-icode2word
    (pyim-dregcache-save-variable 'pyim-dregcache-icode2word
                                  pyim-dregcache-icode2word))
  ;; 词频
  (pyim-dcache-save-variable 'pyim-dregcache-iword2count))

(defun pyim-dregcache-insert-export-content ()
  "TODO"
  )

(defun pyim-dregcache-update-iword2count (word &optional _prepend wordcount-handler)
  "保存词频到缓存."
  (when pyim-debug (message "pyim-dregcache-update-iword2count. word=%s" word))
  (let* ((orig-value (gethash word pyim-dregcache-iword2count))
         (new-value (cond
                     ((functionp wordcount-handler)
                      (funcall wordcount-handler orig-value))
                     ((numberp wordcount-handler)
                      wordcount-handler)
                     (t (+ (or orig-value 0) 1)))))
    (unless (equal orig-value new-value)
      (puthash word new-value pyim-dregcache-iword2count))))

(defun pyim-dregcache-delete-word (word)
  "将中文词条 WORD 从个人词库中删除."
  (with-temp-buffer
    (insert pyim-dregcache-icode2word)
    (goto-char (point-min))
    (let* ((case-fold-search t)
           substring beg end)
      (while (re-search-forward (concat "^\\([a-z-]+\\) \\(.*\\)" word "\\(.*\\)$") nil t)
        (setq beg (match-beginning 0))
        (setq end (match-end 0))
        (setq substring (concat (match-string-no-properties 1)
                                (match-string-no-properties 2)
                                (match-string-no-properties 3)))

        ;; delete string and the newline char
        (delete-region beg (+ 1 end))
        (when (> (length (split-string substring " ")) 1)
          (goto-char beg)
          (insert substring)))
      (setq pyim-dregcache-icode2word
            (buffer-string))))
  ;; 删除对应词条的词频
  (remhash word pyim-dregcache-iword2count))

(defun pyim-dregcache-insert-word-into-icode2word (word code prepend)
  "保存个人词到缓存,和其他词库格式一样以共享正则搜索算法."
  (when pyim-debug
    (message "pyim-dregcache-insert-word-into-icode2word called => %s %s %s"
             word
             code
             prepend))
  (with-temp-buffer
    (when pyim-dregcache-icode2word
      (insert pyim-dregcache-icode2word))
    (goto-char (point-min))
    (let* ((case-fold-search t)
           substring replace-string beg end old-word-list)
      (if (re-search-forward (concat "^" code " \\(.*\\)") nil t)
          (progn
            (setq beg (match-beginning 0))
            (setq end (match-end 0))
            (setq substring (match-string-no-properties 1))
            (delete-region beg end)
            ;; 这里不进行排序，在pyim-dregcache-update-personal-words排序
            (setq old-word-list (pyim-dregcache-sort-words (split-string substring " ")))
            (setq replace-string (concat code " " (string-join (delete-dups `(,@old-word-list ,word)) " "))))
        (setq replace-string (concat code " " (or replace-string word) "\n")))
      (goto-char (or beg (point-max)))
      (insert replace-string))
    (setq pyim-dregcache-icode2word
          (buffer-string))))

(defun pyim-dregcache-upgrade-icode2word ()
  "升级 icode2word 缓存。

dregcache 只支持全拼和双拼，不能用于五笔之类的型码输入法，而
update-icode2word 目前只要是用于更新型码输入法的 code-prefix, 所
以不需要具体实现细节。")

(defun pyim-dregcache-search-word-code-1 (word content)
  (let* ((case-fold-search t)
         (regexp (concat "^\\([a-z-]+\\)\\(.*\\) " "\\(" word " \\|" word "$\\)")))
    (when (string-match regexp content)
      (match-string-no-properties 1 content))))

(defun pyim-dregcache-search-word-code (word)
  "从 `pyim-dregcache-cache' 和 `pyim-dregcache-icode2word' 搜索 word, 得到对应的code."
  (when pyim-debug (message "pyim-dregcache-search-word-code word=%s" word))
  (when pyim-dregcache-cache
    (catch 'result
      (let ((dict-files (pyim-dregcache-all-dict-files))
            code)
        (when pyim-dregcache-icode2word
          (setq code (pyim-dregcache-search-word-code-1 word pyim-dregcache-icode2word))
          (when code (throw 'result (list code))))
        (dolist (file dict-files)
          (let* ((file-info (lax-plist-get pyim-dregcache-cache file))
                 (contents (lax-plist-get file-info :content)))
            (dolist (content (if (listp contents) contents (list contents)))
              (setq code (pyim-dregcache-search-word-code-1 word content))
              (when code (throw 'result (list code))))))))))

(defun pyim-dregcache-export-personal-words (file &optional confirm)
  "将个人词库存入 FILE."
  (when pyim-dregcache-icode2word
    ;; 按词频排序，把词频信息保存到用户词典
    (pyim-dregcache-sort-icode2word)
    (with-temp-buffer
      (insert pyim-dregcache-icode2word)
      ;; 删除单字词
      (goto-char (point-min))
      (while (re-search-forward "^[a-z]+ [^a-z]*" nil t)
        (replace-match "" nil t))
      ;; 按拼音排序
      (sort-lines nil (point-min) (point-max))
      (pyim-dcache-write-file file confirm))))

;; * Footer

(provide 'pyim-dregcache)
;;; pyim-dregcache.el ends here