fsa - A file system analyzer for finding duplicates and analyzing disk usage

#!/usr/bin/env bash
help_text="
NAME
    fsa - File system analyzer and duplicate file finder.

USAGE
    fsa [options]

OPTIONS
    -r|--reuse
        Reuse the previous scan instead of running a new one.

    -h|--help
        Show help text.

COMMANDS
    c           Show file count by type (file/directory).
    ds          Show directory sizes.
    fd          Find duplicate files (same name and size).
    f           Show current filter settings.
    f fn <pat>  Set filename filter pattern.
    f ms <size> Set minimum size filter.
    lg          List 100 largest files.
    sq          Open SQLite3 shell for custom queries.
    !<cmd>      Run a shell command.
    h|help      Show command help.
    q|quit      Quit the program.

DESCRIPTION
    A file system analyzer that scans directories and stores file metadata
    in a SQLite database for querying. Useful for finding duplicate files,
    identifying large files consuming disk space, and analyzing directory
    structure. Scans are cached in /tmp and can be reused. Automatically
    excludes .git, node_modules, and oracle_client directories.

AUTHOR
    mjnurse.github.io - 2020
"
help_line="File system analyzer and duplicate file finder"
web_desc_line="A file system analyzer for finding duplicates and analyzing disk usage"

try="Try ${0##*/} -h for more information"
tmp="${help_text##*USAGE}"
usage=$(echo "Usage: ${tmp%%OPTIONS*}" | tr -d "\n" | sed "s/  */ /g")

div="#@&#"
reuse_yn=n

while [[ "$1" != "" ]]; do
    case $1 in
        -h|--help)
            echo "$help_text"
            exit
            ;;
        -r|--reuse)
            reuse_yn=y
            ;;
    esac
    shift
done

if [[ "$reuse_yn" == "y" ]]; then
    tmp="$(ls /tmp/fsa:*.db)"
    file="${tmp/.db/}"
else
    if [[ "$(ls /tmp/fsa:* 2> /dev/null | wc -l )" == "1" ]]; then
        tmp="$(ls /tmp/fsa:*.db)"
        file="${tmp/.db/}"
        echo "Scan last run: ${file##*:}"
        read -p "Reuse this scan [yN]: " reuse_yn
    fi
fi

if [[ "${reuse_yn^^}" != "Y" ]]; then
    rm -f /tmp/fsa:*
    file="/tmp/fsa:$(date +'%y%m%d-%H%M')"
    echo "Running Scan - this may take a while"
    echo "f,d,s,b,t" > $file.csv
    find . -printf "%f${div}%h${div}%s${div}%b${div}%y\n" | sed "s/,//g; s/${div}/,/g" >> $file.csv
    echo -e '.mode csv\n.import '$file.csv' files\n' | sqlite3 $file.db
    rm -f $file.csv
fi

prompt="fsa: $(pwd) > "
filename_filter="%"
min_size_filter="0"
max_depth="999"

std_filter="AND d NOT LIKE '%oracle_client%' AND d NOT LIKE '%\/.git\/%'"
std_filter="$std_filter AND d NOT LIKE '%\/node_modules\/%'"

function sq() {
    table="(SELECT * FROM files WHERE f LIKE '${filename_filter}' AND (s+0) > ${min_size_filter} $std_filter)"
    echo "$1" | sed "s/^ *//; s/{table}/$table/; s/{max_depth}/$max_depth/" | sqlite3 $file.db
}

function set_filter() {
    case $1 in
        fn)
            filename_filter="$2"
            ;;
        fs)
            min_size_filter="$2"
            ;;
    esac
}

file_count="
    .mode column
    .headers on
    SELECT CASE t
        WHEN 'f' THEN 'file'
        WHEN 'd' THEN 'directory'
        ELSE t END AS type
         , COUNT(*) num
    FROM {table} GROUP BY t;
"

file_dups="
    .mode column
    .width 40 10 60
    SELECT    f, s, details
    FROM (
        SELECT    f
                , s
                , 0
                , COUNT(*)||' ----------------------------------' AS details 
        FROM      {table}
        WHERE     t = 'f'
        GROUP BY  f
                , s
        HAVING    COUNT(*) > 1
        UNION
        SELECT    f
                , s
                , 1
                , d||'/'||f AS details
        FROM      {table}
        WHERE     (f, s) IN (
            SELECT    f, s
            FROM      {table}
            WHERE     t = 'f'
            GROUP BY  f, s
            HAVING    COUNT(*) > 1 )
        ORDER BY 1,2,3
    );"

largest_files="
    .mode column
    .headers on
    .width 40 10 60
    SELECT f, (s/1024)||'KB' AS kb, d||'/'||f AS file
    FROM   {table}
    ORDER BY (s+0)
    LIMIT 100;"

dir_sizes="
    .mode column
    .headers on
    .width 60 10 20

    SELECT  dirs.d,
            COUNT(f),
            SUM(s) 
    FROM (
        SELECT  d
        FROM    files 
        WHERE   LENGTH(d) - LENGTH(REPLACE(d, '/', '')) <= {max_depth}
        GROUP BY d) dirs
    JOIN files ON (files.d LIKE dirs.d||'%')
    GROUP BY    dirs.d
    ORDER BY    dirs.d;
"

while [ 1 ]; do
    read -e -p "$prompt" option
    history -s "$option"

    case $option in
        c)
            sq "$file_count"
            ;;
        ds)
            sq "$dir_sizes"
            ;;
        fd)
            sq "$file_dups"
            ;;
        o)
            echo "Options:"
            echo "- filename filter (fn): ${filename_filter}"
            echo "- min size filter (fs): ${min_size_filter}"
            echo "- max dir depth   (md): ${max_depth}"
            ;;
        f*)
            set_filter $option
            ;;
        h|help)
            echo "Commands:"
            echo "c      - file count by type"
            echo "ds     - directory sizes"
            echo "fd     - file duplicates"
            echo "fn     - set filename filter option"
            echo "fs     - set minimum file size filter option"
            echo "md     - set maximum directory depth option"
            echo "o      - show options"
            echo "sq     - open sqlite3"
            echo "!<cmd> - Run shell command"
            echo "q      - quit"
            ;;
        lg)
            sq "$largest_files"
            ;;
        md*)
            max_depth="${option#md }"
            ;;
        q|quit)
            exit
            ;;
        sq)
            sqlite3 $file.db
            ;;
        !*)
            ${option:1}
            ;;
        *)
            echo "No such option.  Try h for help"
            ;;
    esac
done