#!/bin/bash get_line_count() { wc -l "$1" | cut -d' ' -f1 } process_file() { local INPUT_FILE=$1 local FORCE=$2 local OUTPUT_FILE="${INPUT_FILE}.metta" local HEAD=$(basename -- "$INPUT_FILE") HEAD=${HEAD/.tsv/} HEAD=${HEAD/.fb/} # HEAD=${HEAD%.*} # Remove the file extension HEAD=${HEAD/_fb_2[0-9][0-9][0-9]_[0-9][0-9]/} HEAD=${HEAD/2[0-9][0-9][0-9]_[0-9][0-9]/} HEAD=${HEAD/\./_} # Skip if the input file is already a .metta file if [[ "$INPUT_FILE" == *.metta ]]; then #echo "Skipping .metta file: $INPUT_FILE" return fi local input_lines=$(get_line_count "$INPUT_FILE") echo -e "Into-MeTTa: $input_lines lines. \t $INPUT_FILE " echo -en " " # Check if the output file exists and is newer than the input file if [[ "$FORCE" != "--force" ]] && [ -e "$OUTPUT_FILE" ] && [ "$OUTPUT_FILE" -nt "$INPUT_FILE" ]; then local line_count=$(get_line_count "$OUTPUT_FILE") # Calculate 80% of the input line count local threshold=$(( input_lines * 80 / 100 )) if [ "$line_count" -ge "$threshold" ] && [ "$line_count" -ge "400" ]; then echo -e "$line_count atoms. \t Skipping: $HEAD already exists and is newer." return fi fi local start_time=$(date +%s.%N) # Get the start time with nanosecond precision dos2unix -f "$INPUT_FILE" > /dev/null 2>/dev/null # Process based on the file extension case "$INPUT_FILE" in *.obo | *.json | *.fa) rm -f "$OUTPUT_FILE" echo -ne "." ###########swipl -l src/main/flybase_convert.pl -- --convert "$INPUT_FILE" --halt | tee -p "$OUTPUT_FILE" #echo "" ; echo "" #echo "swipl -l src/main/flybase_convert.pl -- --context=$HEAD --convert \"$INPUT_FILE\" --halt > \"$OUTPUT_FILE\"" #echo "" ; echo "" swipl -l src/main/flybase_convert.pl -- --context=$HEAD --convert "$INPUT_FILE" --halt #> "$OUTPUT_FILE" 2>/dev/null echo -ne "." # Create a temporary file temp_file=$(mktemp) echo -ne "." # Make sure temporary file will be deleted on script exit trap 'rm -f "$temp_file"' EXIT # Remove duplicates with awk and save to temporary file awk '!seen[$0]++' "$OUTPUT_FILE" > "$temp_file" # Overwrite the original file with the temporary file mv "$temp_file" "$OUTPUT_FILE" echo -ne "." ;; *) echo -ne "." #python3 src/mettalog.py "$INPUT_FILE" --analyze > "$OUTPUT_FILE" python3 src/panda_util.py "$INPUT_FILE" --analyze > "$OUTPUT_FILE" skip_lines=$? echo -ne "." awk -v head="$HEAD" -v skip="$skip_lines" ' function should_quote_metta(field) { if (field ~ /[ ()\n\t\r]/) { return 1 # Quote if field contains space, (, ), newline, tab, or carriage return } if (field ~ /[^[:print:]]/) { return 1 # Quote if field contains any non-printing character } if (field ~ /[" ]|'\''|\/|,|\|/) { # return 1 # Quote if field contains ", space, , /, , or | } if (field ~ /^[0-9]/ && field !~ /^[0-9]+$/) { # return 1 # Quote if field starts with a number and contains non-numeric characters } return 0 # No need to quote } BEGIN {FS="\t"; OFS=" "} { # Skip lines that start with whitespace (if any) followed by # or ; if (/^[[:space:]]*[#;]/) { next } # Skip lines that does not contain a tab character (thus having less than 2 fields) if (NF < 2) { next } # Construct the line to check for uniqueness line_output = "(" head non_blank_found = 0 for (i = 1; i <= NF; i++) { # Remove all control characters except newline gsub(/[[:cntrl:]]/, "", $i) # Escape double quotes to avoid breaking the output syntax gsub(/"/, "\\\"", $i) if ($i == "" || $i ~ /^[[:space:]]*$/) { # Represent fields that are only whitespace as () line_output = line_output " ()" } else { # Determine if we should quote this field if (should_quote_metta($i)) { line_output = line_output " \"" $i "\"" } else { line_output = line_output " " $i } non_blank_found = 1 } } # Skip this line if it is a duplicate if (non_blank_found && !(line_output in printed_lines)) { if (skip > 0) { skip-- } else { print line_output ")" printed_lines[line_output] = 1 } } } ' "$INPUT_FILE" >> "$OUTPUT_FILE" ;; esac local end_time=$(date +%s.%N) # Get the end time with nanosecond precision local elapsed_time=$(echo "$end_time - $start_time" | bc) # Calculate elapsed time using bc elapsed_time=$(printf "%.2f" "$elapsed_time") # Format the elapsed time local output_lines=$(get_line_count "$OUTPUT_FILE") echo -e "$output_lines atoms. \t Converted $HEAD in $elapsed_time seconds." } # Check if any arguments are provided if [ "$#" -eq 0 ]; then echo "Usage: $0 [--force] " exit 1 fi FORCE="" ARGS=() # Separate the --force flag from the other arguments for arg in "$@"; do if [[ "$arg" == "--force" ]]; then FORCE="--force" else ARGS+=("$arg") fi done # Process each argument for INPUT_PATH in "${ARGS[@]}"; do if [ -d "$INPUT_PATH" ]; then # Start timing for directory processing dir_start_time=$(date +%s.%N) echo "Processing TSV files: $INPUT_PATH" find "$INPUT_PATH" -type f -name "*.*" -not -name "*.metta" -not -name "*.obo" -not -name "*.json" -not -name "*.fa" -print0 | while IFS= read -r -d $'\0' file; do process_file "$file" "$FORCE" done echo "Processing Special files: $INPUT_PATH" find "$INPUT_PATH" -type f \( -name "*.obo" -or -name "*.json" -or -name "*.fa" \) -print0 | while IFS= read -r -d $'\0' file; do echo process_file "$file" "$FORCE" done # End timing for directory processing dir_end_time=$(date +%s.%N) dir_elapsed_time=$(echo "$dir_end_time - $dir_start_time" | bc) dir_elapsed_time=$(printf "%.2f" "$dir_elapsed_time") echo "Completed processing of directory '$INPUT_PATH' in $dir_elapsed_time seconds." elif [ -f "$INPUT_PATH" ]; then # Show elapsed time for individual files process_file "$INPUT_PATH" "$FORCE" head -100 "$INPUT_PATH".metta tail -10 "$INPUT_PATH".metta else echo "Error: '$INPUT_PATH' is not a valid file or directory." fi done