% Configure file search paths for different biological data categories, enabling easy reference to data files.
user:file_search_path(gencode, './metta_out_v4_pl/gencode').
user:file_search_path(uniprot, './metta_out_v4_pl/uniprot').
user:file_search_path(ontology, './metta_out_v4_pl/ontology').
user:file_search_path(gaf, './metta_out_v4_pl/gaf').
user:file_search_path(cellxgene, './metta_out_v4_pl/cellxgene').
user:file_search_path(eqtl, './metta_out_v4_pl/gtex/eqtl').
user:file_search_path(tadmap, './metta_out_v4_pl/tadmap').
user:file_search_path(refseq, './metta_out_v4_pl/refseq').

%%% FACT FILES

% Declare which files contain facts, aiding in modular loading of data.
fact_files(eqtl(edges)).
fact_files(gencode(nodes)).  fact_files(gencode(edges)).
fact_files(uniprot(nodes)).  fact_files(uniprot(edges)).
fact_files(ontology(nodes)). fact_files(ontology(edges)).
fact_files(gaf(edges)).
fact_files(cellxgene(edges)).
fact_files(tadmap(edges)).
fact_files(refseq(edges)).

% Load Constraint Logic Programming over Finite Domains library.
:- ensure_loaded(library(clpfd)).
% :- use_module(library(sldnfdraw)).
% :- sldnf.


%%% RULES START
rules_start.  % Rules (for use in proofs) come after this

% Define a tabled predicate codes_for/2 to avoid recomputation of results.
:- table(codes_for/2).
codes_for(G, P) :-
    transcribed_to(G, T),
    translates_to(T, P).

% Define a tabled predicate in_tad_with/2. Tables speed up recursive predicates by memoization.
:- table(in_tad_with/2).
in_tad_with(S, G1) :-
    different(G1,G2),
    in_tad_region(G1, T),
    in_tad_region(G2, T),
    closest_gene(S, G2).

% Table the predicate relevant_gene/2 for efficient querying.
:- table(relevant_gene/2).
relevant_gene(G, S) :-
    in_tad_with(S, G),
    eqtl(S, G).

% Table relevant_gene_coexpression/2 to memoize results for performance.
:- table(relevant_gene_coexpression/2).
relevant_gene_coexpression(G1, S) :-
    relevant_gene(G2, S),
    different(G1, G2),
    coexpressed_with(G1, G2).

% Table member_nat/3 for optimization.
:- table(member_nat/3).
member_nat(G, O, 0) :-
    codes_for(G, P),
    go_gene_product(O, P).
member_nat(G, O, s(D0)) :-
    rel_type(ontology_relationship(X, O), subclass),
    member(G, X, D0).

% Another version of member/3 that uses clpfd for the depth.
:- table(member/3).
member(G, O, 0) :-
    codes_for(G, P),
    go_gene_product(O, P).
member(G, O, D) :-
    D #= D0 + 1,
    rel_type(ontology_relationship(X, O), subclass),
    member(G, X, D0).

% Table relevant_go/2 to cache its results.
:- table(relevant_go/2).
relevant_go(O, S) :-
    (relevant_gene(G, S)
    ; relevant_gene_coexpression(G, S)),
     member(G, O, _D).  % can swap this out for speed checks

%%% RULES END
rules_end.


% Ensures two terms, G1 and G2, are different using the standard dif predicate.
different(G1,G2):-
    dif:dif(G1,G2).
    % The dif:dif(G1, G2) ensures G1 and G2 cannot unify now or in the future,
    % supporting logical purity by allowing Prolog to delay evaluation until
    % enough information is available.

% Ensures two terms, G1 and G2, are different by deferring the inequality check
% until either G1 or G2 is instantiated.
different_f(G1,G2):-
    % freeze/2 delays execution of its second argument (a goal) until its first
    % argument (G1 or G2 here) is instantiated.
    freeze(G1, G1\==G2),
    freeze(G2, G2\==G1).
    % G1\==G2 is the goal that checks if G1 and G2 are not strictly equal.
    % If either G1 or G2 becomes instantiated, Prolog then checks if this goal can succeed.
    % This approach defers the check for G1 and G2 being different until
    % one of them gets a concrete value, at which point the inequality check is performed.


% Example queries/tests that demonstrate how to use the defined predicates and facts.
tests :- solve(relevant_go(ontology_term(go_0045598), sequence_variant(rs1421085))).
tests :- solve(relevant_go(_O, sequence_variant(rs1421085))).
tests :- solve(_Any).

% List out the test predicates for inspection.
:- listing(tests).

/*
?- solve(relevant_go(ontology_term(go_0045598), sequence_variant(rs1421085))).

Proof=
        proven(relevant_go(ontology_term(go_0045598),sequence_variant(rs1421085))) -->
          ( rule(relevant_go(A,B)) :-
              relevant_gene(C,B);relevant_gene_coexpression(C,B) ,
              member(C,A,_)  ,
            or_r( proven(relevant_gene_coexpression(gene(ensg00000175602),sequence_variant(rs1421085))) -->
                    ( rule(relevant_gene_coexpression(D,E)) :-
                        relevant_gene(F,E) ,
                        different(D,F) ,
                        coexpressed_with(D,F)  ,
                      proven(relevant_gene(gene(ensg00000177508),sequence_variant(rs1421085))) -->
                        ( rule(relevant_gene(G,H)):-in_tad_with(H,G),eqtl(H,G)  ,
                          proven(in_tad_with(sequence_variant(rs1421085),gene(ensg00000177508))) -->
                            ( rule(in_tad_with(I,J)) :-
                                different(J,K) ,
                                in_tad_region(J,L) ,
                                in_tad_region(K,L) ,
                                closest_gene(I,K)  ,
                              built_in(different(gene(ensg00000177508),gene(ensg00000140718))) ,
                              fact(in_tad_region(gene(ensg00000177508),tad(chr16_53550000_55450000_grch38))) ,
                              fact(in_tad_region(gene(ensg00000140718),tad(chr16_53550000_55450000_grch38))) ,
                              fact(closest_gene(sequence_variant(rs1421085),gene(ensg00000140718)))) ,
                          fact(eqtl(sequence_variant(rs1421085),gene(ensg00000177508)))) ,
                      built_in(different(gene(ensg00000175602),gene(ensg00000177508))) ,
                      fact(coexpressed_with(gene(ensg00000175602),gene(ensg00000177508))))) ,
            proven(member(gene(ensg00000175602),ontology_term(go_0045598),1)) -->
              ( rule(member(M,N,O)) :-
                    ((
                     (integer(O) ->
                        P=O,clpfd:clpfd_equal(P,Q+1));
                     (clpfd: clpfd_equal(O,Q+1))  )) ,
                  rel_type(ontology_relationship(R,N),subclass) ,
                  member(M,R,Q)  ,
                fact(rel_type(ontology_relationship(ontology_term(go_0045599),ontology_term(go_0045598)),subclass)) ,
                proven(member(gene(ensg00000175602),ontology_term(go_0045599),0)) -->
                  ( rule(member(S,T,0)):-codes_for(S,U),go_gene_product(T,U)  ,
                    proven(codes_for(gene(ensg00000175602),protein(q15834))) -->
                      ( rule(codes_for(V,W)) :-
                          transcribed_to(V,X) ,
                          translates_to(X,W)  ,
                        fact(transcribed_to(gene(ensg00000175602),transcript(enst00000312579))) ,
                        fact(translates_to(transcript(enst00000312579),protein(q15834)))) ,
                    fact(go_gene_product(ontology_term(go_0045599),protein(q15834)))) ,
                or_swap(or_l(built_in(clpfd:clpfd_equal(1,0+1))))))


*/

% ==================================================
% DONT LAUGH BUT DOUGLAS LET GPT-4 COMMENT THIS FILE
% ==================================================

% The core functionality revolves around the 'solve/1' and 'solve/2' predicates,
% which are central to executing logical queries within this framework. 'solve/1'
% aims to find solutions to logical goals without repeats, leveraging a pretty
% print function 'ppp' for clear output. 'solve/2' extends this by handling
% unbound variable goals, directly solving truths, managing negations, and optimizing
% logical operations based on predefined heuristics for efficiency.

% Additionally, the script takes care of loading external fact files specified
% by the user, marking them for reference and ensuring they're loaded only once.
% This functionality supports the system's adaptability to new or updated datasets,
% making it a versatile tool for logical reasoning tasks. To enhance performance,
% garbage collection can be toggled, offering control over the system's resource
% management during intensive operations.

% Beyond these, the script employs meta-programming techniques to dynamically
% track and assert facts and rule heads for later use or inspection.  Utility predicates
% for calculating predicate sizes and prioritizing goal execution further contribute
% to the system's efficiency and adaptability.

% Set Prolog flags to reduce verbosity during autoload and loading phases.
:- set_prolog_flag(verbose_autoload,false),
   set_prolog_flag(verbose_load,false),
   ensure_loaded(library(logicmoo_common)).


% Declare a dynamic predicate to keep track of which fact files have been loaded.
:- dynamic(is_fact_file_name/1).
% Load all fact files specified by 'fact_files/1' predicates, ensuring they are found and readable.
load_fact_files:- forall(fact_files(Spec),load_fact_file(Spec)).
load_fact_file(Spec):-
  absolute_file_name(Spec,File,
     [access(read), file_errors(fail), file_type(prolog)]),
  load_fact_file_name(File).

% If a fact file has not been previously loaded, record its name and load the file.
load_fact_file_name(File):-
  is_fact_file_name(File),!. % Skip if file is already marked as loaded.
load_fact_file_name(File):-
  assert(is_fact_file_name(File)), % Mark the file as loaded.
  time(ensure_loaded(File)). % Load the file, measuring the time taken.

% Trigger the loading process for all specified fact files.
:- time(load_fact_files).


% Assert a given Clause if it's not already true; otherwise, just indicate it's already present.
assert_singly(Clause):- notrace(catch(Clause,_,fail)),!,writeln(present(Clause)).
assert_singly(Clause):- assert(Clause).

%%% FIND AND CACHE RULES HEADS
% Rebuilt if the file is releaded anyways
:- abolish(rule_head/1).
% Declare dynamic predicate for tracking rule heads from introspection.
:- dynamic(rule_head/1).
% Utility predicate to find all rule heads in the file using introspection.
find_rule_head(Head):-
    % Get the line number where 'rules_start' and 'rules_end' are defined.
    predicate_property(rules_start, line_count(LineNumberStart)),
    predicate_property(rules_end, line_count(LineNumberEnd)),
    % Get the file (this file name) in which 'rules_start' is defined.
    source_file(rules_start, ThisFile),
 % We iterate from here
    % Ensure 'Head' is defined in the same file as 'find_rule_head'.
    source_file(Head, ThisFile),
    % Get the line number where 'Head' is defined.
    predicate_property(Head, line_count(LineNumber)),
    % Make sure that line is beteeen rule_state and rule_end
    LineNumberStart < LineNumber,  LineNumber < LineNumberEnd,
    % Ensure that 'Head' has at least one rule associated with it.
    \+ predicate_property(Head, number_of_rules(0)).
% For each rule head found, assert it as a 'rule_head' fact for later use or inspection.
:- forall(find_rule_head(Head),assert_singly(rule_head(Head))).
% List out all asserted 'rule_head' facts for review.
:- listing(rule_head/1).
/* should return
rule_head(member(_,_,_)).
rule_head(relevant_go(_, _)).
rule_head(relevant_gene_coexpression(_, _)).
rule_head(relevant_gene(_, _)).
rule_head(in_tad_with(_, _)).
rule_head(codes_for(_, _)).
*/


%%% FACT HEADS
% Rebuilt if the file is reloaded anyways
:- abolish(fact_head/1).
% Declare dynamic predicate for tracking fact heads for introspection.
:- dynamic(fact_head/1).
% Define a predicate to find the head of any fact defined in loaded fact files.
find_fact_head(Head):-
    % Check if 'File' is among the names of fact files that have been loaded.
    is_fact_file_name(File),
    % Find a predicate 'Head' that is defined in 'File'.
    source_file(Head,File).
% For each fact head found, assert it as a 'fact_head' fact for later use or inspection.
:- forall(find_fact_head(Head),assert_singly(fact_head(Head))).
% List out all asserted 'fact_head' facts for review.
:- listing(fact_head/1).
/* should return
fact_head(gene(_)).
fact_head(closest_gene(_, _)).
fact_head(in_tad_region(_, _)).
fact_head(transcribed_to(_, _)).
fact_head(translates_to(_, _)).
fact_head(rel_type(_, _)).
fact_head(eqtl(_, _)).
fact_head(go_gene_product(_, _)).
*/


% Disable the Prolog garbage collector. (uncomnment so that whn you press control C you can see all stack frames before theyu get GC'd
%:- set_prolog_flag(gc,false).

% Attempt to find a solution for 'Goal' and print its proof without repeats.
solve(Goal):-
  % Use 'no_repeats' to ensure each solution for each 'Goal/Proof' pair is unique before proceeding.
  no_repeats(solve(Goal, Proof)),
  % Call 'ppp' to pretty print the found 'Proof'.
  ppp(Proof).

% Define 'ppp' to format and print proofs in a readable manner.
ppp(Proof):-
 % Use 'format' to structure the output, introducing the proof with a newline and indent.
 format('~N~nProof=~n\t'),
 % Employ double negation to ensure 'Proof' variables are universally quantified, making the output cleaner.
 % 'numbervars' attributes unique numbers to variables, enhancing readability.
 \+ \+ (numbervars(Proof,0,_,[attvars(skip), singletons(true)]),pppt(Proof)).
% Utilize 'pppt' to actually print the structured proof tree.
pppt(Proof):-
  % 'print_tree' is called to visually represent 'Proof' in a tree structure, followed by newlines for separation.
  print_tree(Proof),nl,nl.

% Overloaded 'solve' for handling goals and associated proofs.
% Handle unbound Goal by trying all known rules.
solve(Goal, Proof):- var(Goal), rule_head(Goal),solve(Goal, Proof).
% Handle unbound Goal by returning all known facts.
solve(Goal, Proof):- var(Goal), fact_head(Goal),solve(Goal, Proof).
% Directly solve for 'true', which always succeeds.
solve(true, true) :- !.
% Directly solve for 'fail', which always fails.
solve(fail, _) :- !, fail.
% Handle logical negation through Trie-Not
solve(not(Goal), tnot(Proof)) :- !, tnot(solve(Goal, Proof)).
% Handle negation by failure
solve( \+ (Goal), naf(Proof)) :- !, \+ solve(Goal, Proof).
% Optimize the order of AND ('Goal1, Goal2') goals based on a heuristic.
% Goal3 exists to make sure Goal2 is a Literal (nonconnective Goal)
solve((Goal1, Goal2, Goal3), (Proof2, Proof13)) :-
  % Prefer solving the second goal first if it's determined to be "better".
  second_is_better_than_first(Goal1,Goal2),!,
  % Recursively solve each goal in the optimized order.
  solve(Goal2, Proof2),solve((Goal1, Goal3), Proof13).
solve((Goal1, Goal2), (Proof2, Proof1)) :- Goal2 \= (_,_),
  % Prefer solving the second goal first if it's nonconjunctions and "better"
  second_is_better_than_first(Goal1,Goal2),!,
  % Recursively solve each goal in the optimized order.
  solve(Goal2, Proof2),solve(Goal1,Proof1).

% Optimize the order of OR ('Goal1; Goal2') goals using the same heuristic.
solve((Goal1; Goal2), or_swap(Proof)) :-
  % Swap goals if the second is "better".
  second_is_better_than_first(Goal1,Goal2),!,
  % Attempt to solve the goals in the new order.
  solve((Goal2; Goal1), Proof).

% Recursively solve conjunctions ('Goal1, Goal2') in the provided order.
solve((Goal1, Goal2), (Proof1 , Proof2)) :- !,
    % Solve each part of the conjunction separately.
    solve(Goal1, Proof1),
    solve(Goal2, Proof2).

% Attempt to solve each part of a disjunction ('Goal1; Goal2').
solve((Goal1 ; _), or_l(Proof1)) :-    solve(Goal1, Proof1).
solve((_ ; Goal2), or_r(Proof2)) :- !, solve(Goal2, Proof2).

% Attempt to solve 'Goal' by finding a corresponding rule.
solve(Goal, Proof) :- rule_head(Goal),!,
  % Look for a clause where 'Goal' matches the head.
  clause(Goal, Body, Ref),
  clause(HeadC, BodyC, Ref),
  % Solve the body of the clause as the next step in the proof.
  solve_body(HeadC, BodyC, Goal, Body, Proof).

% Check if 'Goal' matches a known fact in the database.
solve(Goal, fact(Goal)) :- fact_head(Goal),!, no_repeats_g(Goal).
% Check if 'Goal' is a built-in Prolog predicate and solve directly.
solve(Goal, built_in(Goal)) :-
    % Avoid repeats for built-in predicates.
    no_repeats_g(Goal).

% Helper to avoid repeating solutions for ground goals.
no_repeats_g(G):- ground(G),!,once(G).
% For non-ground goals, use 'no_repeats' to prevent duplicate solutions.
no_repeats_g(G):- no_repeats(G).

% Differentiate solving for direct fact matches versus implications.
solve_body(HeadC,    _,  Goal, true, rfact(Goal)):- HeadC  == Goal,!.
solve_body(HeadC,    _,  Goal, true, (rule(HeadC),implied(Goal))):- HeadC \== Goal,!.
% For other cases, solve the body to continue building the proof.
solve_body(HeadC, BodyC, Goal, Body, (proven(Goal) --> ((rule(HeadC):-BodyC), Proof))):-
  % Recurse into the body of the rule for proof.
  solve(Body, Proof).


% ==========================================
% UTILITY: second_is_better_than_first/2
% ==========================================

% Heuristic for prioritizing goal order based on variable binding and complexity.
second_is_better_than_first(Hypo1, Hypo2) :-
    % Count the unbound variables in each hypothesis.
    term_variables(Hypo1,TV1),length(TV1,C1),
    term_variables(Hypo2,TV2),length(TV2,C2),
    second_is_better_than_first(Hypo1, Hypo2, TV1, TV2 , C1, C2).

% If ground it dont matter
second_is_better_than_first(_Hypo1, _Hypo2, _TV1, _TV2 , 0, 0):- !, fail.
% same number of vars (estimate solution size)
second_is_better_than_first(Hypo1, Hypo2, _TV1, _TV2 , N, N):- !,
    second_is_smaller_than_first(Hypo1, Hypo2).
% Prefer hypotheses with fewer unbound variables.
second_is_better_than_first(_Hypo1, _Hypo2, _TV1, _TV2 , C1, C2):-
    C1\=C2, !, C1>C2.

% Use the size of predicates to decide order, preferring smaller predicates.
second_is_smaller_than_first(Hypo1, Hypo2) :-
    predicate_size(Hypo1, Size1),
    predicate_size(Hypo2, Size2), !,
    Size2 < Size1.

% Calculate the "size" or complexity of a hypothesis.
predicate_size(Hypo, Size):-
  predicate_size([], Hypo, Size).

% Check if a function is already considered in the size calculation.
f_memberchk(F,List):- \+ \+ (member(E,List),E=@=F).

% Calculate the size based on structure and grounding.
predicate_size(_Completed, Hypo, Size) :- \+ callable(Hypo), !, Size=1.
predicate_size(Completed, Hypo, Size):- f_memberchk(Hypo, Completed), !, Size=0.
predicate_size(_Completed, Hypo, Size):- ground(Hypo),!, Size=1.
predicate_size(Completed, (Hypo1, Hypo2), Size):- !,
  % Sum the sizes of individual components in a conjunction.
  predicate_size(Completed, Hypo1, S1),
  predicate_size(Completed, Hypo2, S2),
  size_sm(S1, S2, SizeM), Size is SizeM + 1,!.
predicate_size(Completed, (Hypo1;Hypo2), Size):-
  % For disjunctions, add the sizes of the options.
  predicate_size(Completed, Hypo1, S1),
  predicate_size(Completed, Hypo2, S2),
  Size is S1+S2,!.
predicate_size(Completed, Hypo, Size):-
   % Account for the number of clauses and rules in determining size.
   predicate_property(Hypo, number_of_clauses(F)),
   predicate_property(Hypo, number_of_rules(R)),
   predicate_size(Completed, Hypo, F, R, Size),!.
predicate_size(_Completed, _Hypo, 1).

% Helper functions for size comparison.
size_sm(A, B, A) :- B =:= 0, !.
size_sm(A, B, B) :- A =:= 0, !.
size_sm(A, B, A) :- A =< B.
size_sm(A, B, B) :- B < A.


% Handle the case where a predicate is defined entirely by facts.
predicate_size(_Completed, _, F, 0, F):-!.
    % When the number of rules (R) is 0, implying the predicate is defined only by facts,
    % the size is directly the number of facts (F).

% Handle the case where a predicate is defined entirely by rules.
predicate_size(Completed, Hypo, F, R, Size):- F == R, !,
    % Calculate the size based on the total extent of these rules.
    rule_extent(Completed, Hypo, Size).
    % If a predicate is defined only by rules (when F is equal to R),
    % determine its size by calculating the extent of these rules.

% Handle predicates defined by a mix of rules and facts.
predicate_size(Completed, Hypo, F, R, Size):-
    % Calculate the rule extent to understand the complexity added by rules.
    rule_extent(Completed, Hypo, RSize),
    % Adjust the size by subtracting rule count from fact count and adding calculated rule size.
    Size is F - R + RSize.
    % For predicates defined by both facts and rules, the size calculation adjusts
    % for the rule complexity (RSize) on top of the base fact count (F), corrected by the rule count (R).

% Calculate the "extent" of rules for a hypothesis, considering no prior completions.
rule_extent(Hypo, Size):-
    rule_extent([], Hypo, Size).
    % This entry point allows for calculating the rule extent of a hypothesis from scratch,
    % initializing the 'Completed' list as empty.

% Avoid recalculating the size for a hypothesis already considered.
rule_extent(Completed, Hypo, Size):- f_memberchk(Hypo, Completed), !, Size=0.
    % If 'Hypo' is already in the 'Completed' list, its size is considered as 0 to avoid
    % double-counting, indicating it's already been accounted for in the complexity calculation.

% Calculate the rule extent for a hypothesis not yet completed.
rule_extent(Completed, Hypo, Size):-
    % Find the size of each clause's body that isn't trivially true, indicating rule complexity.
    findall(S1ze, (clause(Hypo, Body), Body\==true, predicate_size([Hypo|Completed], Body, S1ze)), SizeL),
    % Sum these sizes to get the total complexity contributed by the hypothesis' rules.
    sumlist(SizeL, Size), !.
    % For each rule defining 'Hypo', calculate the complexity of its body, excluding
    % trivial truths. The total rule complexity (Size) is the sum of these individual complexities.

% Default case for rule extent calculation when no specific conditions are met.
rule_extent(_,_Hypo, 0).
    % If none of the specific conditions for calculating rule extent are met,
    % default the rule extent to 0. This might occur if the hypothesis doesn't match
    % any rules or if all rules are trivially true.