#!/usr/bin/env bash
# Full help text printed by -h/--help. The text between the USAGE and OPTIONS
# headings is also extracted further down to build the one-line usage message,
# so those two headings must stay in place.
help_text="
NAME
parq - Read and display parquet files in various formats.
USAGE
parq [options] <parquet-file>
OPTIONS
-a|--all
Display all records (default shows 10 records).
-c|--csv
Output data in CSV format (default is JSON).
-h|--help
Show help text.
-n|--num <number>
Specify number of records to display.
-s|--schema
Display the parquet file schema only.
DESCRIPTION
parq is a utility for reading Apache Parquet files and displaying their
contents in different formats. By default, it displays the first 10 records
in JSON format. The script uses Python's pandas and pyarrow libraries to
read parquet files and can output in JSON, CSV, or display the schema.
If required Python libraries (pandas, pyarrow) are not installed, the script
will offer to install them automatically.
AUTHOR
mjnurse.github.io - 2025
"
# One-line summaries. Neither variable is read anywhere else in this script;
# presumably they are scraped by external tooling - confirm before removing.
help_line="Read and display parquet files in various formats"
web_desc_line="A bash utility for viewing Apache Parquet files in JSON, CSV, or schema format"

# Hint appended to usage/error output; ${0##*/} is the script's base name.
try="Try ${0##*/} -h for more information"

# Build a single-line usage string from the text between the USAGE and OPTIONS
# headings of help_text. Newlines are mapped to spaces and runs of spaces are
# squeezed to one; the trailing space left by the final newline is trimmed.
# (The previous `sed "s/ */ /g"` also matched the empty string and therefore
# inserted a space between every character.)
tmp="${help_text##*USAGE}"
usage=$(printf 'Usage: %s' "${tmp%%OPTIONS*}" | tr -s ' \n' '  ' | sed 's/ $//')

# With no arguments at all there is nothing to display: show usage and fail.
if [[ "$1" == "" ]]; then
  echo "${usage}"
  echo "${try}"
  exit 1
fi
#######################################
# Parse leading command-line options.
# Parsing stops at the first argument that is not a recognised option; that
# argument (normally the parquet file) and everything after it are left alone.
# Globals:   mode (written)          - json (default), csv or schema
#            recs (written)          - record limit, 0 means "all" (default 10)
#            opts_consumed (written) - number of arguments that were options
#            help_text, try (read)   - used for -h and for error output
# Arguments: the script's command-line arguments
# Returns:   exits 0 after printing help for -h/--help; exits 1 when
#            -n/--num is not followed by a non-negative integer
#######################################
parse_args() {
  local initial_count=$#
  mode=json
  recs=10
  while [[ "$1" != "" ]]; do
    case "$1" in
      -a|--all) recs=0 ;;
      -c|--csv) mode=csv ;;
      -h|--help) echo "$help_text"; exit ;;
      -n|--num)
        # Reject a missing or non-numeric count up front; otherwise it would
        # silently flow into the arithmetic that sizes the output later on.
        if [[ ! "$2" =~ ^[0-9]+$ ]]; then
          echo "${0##*/}: option $1 requires a non-negative integer" >&2
          echo "${try}" >&2
          exit 1
        fi
        # Force base 10 so values such as 08 are not read as invalid octal.
        recs=$((10#$2))
        shift
        ;;
      -s|--schema) mode=schema ;;
      ?*) break ;;
    esac
    shift
  done
  opts_consumed=$((initial_count - $#))
}

parse_args "$@"
# Drop the parsed options so that $1 is the parquet file for the code below.
shift "$opts_consumed"
# Check that the required python libraries are installed; offer to install
# any that are missing.
for lib in pandas pyarrow; do
  # `pip3 show` matches the exact distribution name. Grepping `pip3 list`
  # output was a substring match, so e.g. geopandas satisfied "pandas".
  if ! pip3 show "$lib" >/dev/null 2>&1; then
    echo "Missing python library: $lib"
    # No answer available at all (EOF / closed stdin): do not install silently.
    read -r -p "Install $lib? [Yn]: " yn || exit 1
    # The prompt advertises Y as the default, so a bare Enter means yes.
    yn=${yn:-Y}
    # Accept y, Y, yes, Yes, ...; anything else declines and stops the script,
    # since it cannot work without the library.
    [[ "${yn^}" == Y* ]] || exit 1
    pip3 install "$lib" || exit 1
  fi
done
#######################################
# Execute a Python program given as a string.
# Python's stderr is held back while the program runs: it is discarded when
# Python succeeds (this keeps library warnings out of the output) and replayed
# on stderr when Python fails, so errors are no longer silently lost.
# Arguments: $1  - Python source passed to `python3 -c`
#            $2+ - optional extra arguments, visible to the program as
#                  sys.argv[1:]
# Outputs:   the program's stdout, unchanged
# Returns:   the exit status of python3
#######################################
exec_py() {
  local err status
  # fd 3 carries the real stdout past the command substitution, so only
  # stderr ends up captured in $err.
  { err=$(python3 -c "$1" "${@:2}" 2>&1 1>&3 3>&-); status=$?; } 3>&1
  if ((status != 0)) && [[ -n "$err" ]]; then
    printf '%s\n' "$err" >&2
  fi
  return "$status"
}
#######################################
# Print records from a one-record-per-line dump, followed by a summary line.
# Globals:   mode (read) - when "csv", the first line of the file is a header
#                          and is not counted as a record
#            recs (read) - maximum number of records to print; 0 means all
# Arguments: $1 - file holding the dump
#            $2 - optional filter command the records are piped through
#                 (split on whitespace, so it may carry arguments);
#                 defaults to cat
# Outputs:   the records, a blank line, then "<total> total records" when
#            recs is 0, otherwise "<shown> of <total> total records"
#######################################
show_recs() {
  local file=$1
  local -a viewer
  local header=0 total shown
  read -r -a viewer <<< "${2:-cat}"

  # This must be a string test. Inside $(( )), `mode == csv` compares two
  # unset arithmetic variables (0 == 0) and is therefore always true, which
  # made JSON mode print one record too many.
  [[ $mode == csv ]] && header=1

  total=$(wc -l < "$file") || return
  total=$((total - header))
  ((total < 0)) && total=0

  if [[ $recs == 0 ]]; then
    "${viewer[@]}" < "$file"
    echo
    echo "$total total records"
    return
  fi

  head -n "$((recs + header))" "$file" | "${viewer[@]}"
  # Never claim to have shown more records than the file contains.
  shown=$((recs < total ? recs : total))
  echo
  echo "$shown of $total total records"
}
# Render the parquet file named by the first remaining argument in the
# selected output mode.
if [[ -z "$1" ]]; then
  echo "${0##*/}: no parquet file specified" >&2
  echo "${try}" >&2
  exit 1
fi

# The file name reaches Python through the environment instead of being pasted
# into the Python source, so quotes or other special characters in the name
# can neither break the program nor inject code into it.
export PARQ_FILE="$1"

# Report a failed read and stop. Python's own stderr may be suppressed by
# exec_py, so this message may be the only diagnostic the user sees.
read_failed() {
  echo "${0##*/}: failed to read parquet file: ${PARQ_FILE}" >&2
  exit 1
}

# Create an unpredictable scratch file and remove it on every exit path.
make_tmpfile() {
  tmpfile=$(mktemp "${TMPDIR:-/tmp}/parq_XXXXXX") || exit 1
  trap 'rm -f -- "$tmpfile"' EXIT
}

case $mode in
  csv)
    make_tmpfile
    # pandas writes the CSV straight to the scratch file named in PARQ_OUT.
    export PARQ_OUT="$tmpfile"
    exec_py 'import os, pandas as pd; pd.read_parquet(os.environ["PARQ_FILE"]).to_csv(os.environ["PARQ_OUT"])' \
      || read_failed
    show_recs "$tmpfile"
    ;;
  json)
    make_tmpfile
    # One JSON document per line, so that line counts equal record counts.
    exec_py 'import os, json, pandas as pd; df=pd.read_parquet(os.environ["PARQ_FILE"]); [print(json.dumps(r)) for r in json.loads(df.to_json(orient="records"))]' \
      > "$tmpfile" || read_failed
    # Pretty-print through jq when it is installed, otherwise pass through.
    show_recs "$tmpfile" "$(command -v jq || echo cat)"
    ;;
  schema)
    # Capture first: checking Python's status inside the pipeline would be
    # unreliable because sed quits early at the metadata section.
    schema_text=$(exec_py 'import os, pyarrow.parquet as pq; print(pq.read_table(os.environ["PARQ_FILE"]).schema)') \
      || read_failed
    # Print the non-empty lines that precede the "-- schema" metadata marker;
    # the grep is there for its colouring of the text after each field name.
    printf '%s\n' "$schema_text" | sed -n "/-- schema/q; /./p" | grep --color=auto " .*"
    ;;
esac