#!/bin/bash
#
# Source: WSL/SLF GitLab repository (resample_smetdata.sh)
#
# This script can resample SMET data to lower resolutions.
#
# General recipe: PSUM is always averaged (to keep [mm/h]); other variables are averaged when the switch -m is given, or else they are just taken at the resampled time stamp.
#
# Usage: bash resample_smetdata.sh <filename> <resolution> <-m>
#
#
# Author: Nander Wever
#

# Get command line parameters
filename=$1		#SMET filename
resolution=$2	#output resolution (seconds)

# Output resolution in whole minutes (awk truncates towards zero; an empty
# resolution yields 0). The value is quoted so that it is passed to awk
# verbatim instead of being word-split or glob-expanded by the shell.
resolution_minutes=$(printf '%s\n' "${resolution}" | awk '{print int($1/60)}')

# Resampling method: 1 = take the mean (third argument is exactly "-m"),
# 0 = just resample (no third argument, or any other third argument).
if [ "$3" == "-m" ]; then
	resamplemethod=1	#Take mean
else
	resamplemethod=0	#Just resample
fi

# Check command line parameters
# print_usage: write the usage text to stdout (callers redirect it to stderr).
print_usage() {
	echo "Use: bash resample_smetdata.sh <filename> <resolution> <-m>"
	echo "  <filename>: SMET file"
	echo "  <resolution>: new resolution in seconds. (minimum 2 minutes, maximum 1 day)"
	echo "  <-m>: optional, if -m is added, the mean values are taken, else it is just resampled. PSUM is always an average."
	echo "Output is written to std out."
	echo "Note: - holes in the data are filled with nodata values, and then resampled."
	echo "      - script assumes UTC time zone (without DST)."
	echo "      - when an error is encountered, the script tries to output the SMET file at the original resolution."
}

# Usage errors go to stderr, so they never end up in the resampled data that
# is written to stdout, and the script exits with a non-zero status.
if [ -z "${filename}" ]; then
	echo "ERROR: no file name specified." 1>&2
	print_usage 1>&2
	exit 1
fi

if [ -z "${resolution}" ]; then
	echo "ERROR: no resolution specified." 1>&2
	print_usage 1>&2
	exit 1
fi

# The file is read several times further down, so it has to be a readable
# regular file; fail once here instead of once per pipeline.
if [ ! -f "${filename}" ] || [ ! -r "${filename}" ]; then
	echo "ERROR: cannot read file ${filename}." 1>&2
	exit 1
fi

54
55
# Set shell time to UTC, to allow correct parsing by awk-mktime
export TZ=UTC

# Dump header: copy every line that does not start with a digit to stdout.
# The pattern is quoted so the shell cannot expand ^[0-9] as a glob against
# files in the current directory, and the file name is quoted so that paths
# containing spaces work.
grep -v '^[0-9]' -- "${filename}"

# Now determine some info needed to make a complete file (without holes in the data)
# Data rows are the lines that start with a digit. All patterns and the file
# name are quoted so the shell neither glob-expands ^[0-9] nor splits a file
# name containing spaces. The timestamp arithmetic needs an awk that provides
# mktime (e.g. gawk); seconds are always taken as 0.

# Epoch seconds of the first data row.
firsttimestamp=$(grep '^[0-9]' -- "${filename}" | head -1 | sed 's/[-T:]/ /g' \
	| awk '{print mktime(sprintf("%04d %02d %02d %02d %02d %02d %d", $1, $2, $3, $4, $5, 0, 1))}')

# Native resolution of file is determined by the difference between two time stamps that occurs most often.
# The first awk/sed pair puts each timestamp next to its successor on one line;
# after splitting on [-T:] such a pair has exactly 10 fields.
timeresolution=$(grep '^[0-9]' -- "${filename}" \
	| awk '{if (NR==1) {printf "%s\n", $1} else {printf "%s\n%s\n", $1, $1}}' \
	| sed '$!N;s/\n/ /' \
	| sed 's/[-T:]/ /g' \
	| awk '(NF==10) {print mktime(sprintf("%04d %02d %02d %02d %02d %02d %d", $6, $7, $8, $9, $10, 0, 1))-mktime(sprintf("%04d %02d %02d %02d %02d %02d %d", $1, $2, $3, $4, $5, 0, 1))}' \
	| sort -nk1 | uniq -c | sort -nrk1 \
	| awk '(NR==1){print $2}')

# Epoch seconds of the last data row.
lasttimestamp=$(grep '^[0-9]' -- "${filename}" | tail -1 | sed 's/[-T:]/ /g' \
	| awk '{print mktime(sprintf("%04d %02d %02d %02d %02d %02d %d", $1, $2, $3, $4, $5, 0, 1))}')

# Value after the '=' on the first "nodata" header line, with all whitespace
# (spaces, tabs, carriage returns) removed.
nodatavalue=$(grep '^nodata' -- "${filename}" | head -1 | awk -F= '{print $NF}' | sed 's/[[:space:]]//g')

# Number of columns after the timestamp, counted on the first data row.
nsensors=$(grep '^[0-9]' -- "${filename}" | head -1 | awk '{print NF-1}')

# 1-based position of the field whose name contains PSUM in the "fields" header line.
col_psum=$(grep '^fields' -- "${filename}" | head -1 | awk -F= '{print $NF}' | tr ' ' '\n' | grep -v '^$' | grep -n PSUM | awk -F: '{print $1}')
if [ -z "${col_psum}" ]; then
	# No PSUM field: use a column number that never matches.
	col_psum=-1
fi
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90

# Sanity checks before resampling. In every case below the data rows are
# written to stdout unchanged (original resolution) and the script stops;
# error messages go to stderr so they do not mix with the data.

# print_data_rows: write the data rows (lines starting with a digit) of the input file to stdout.
print_data_rows() {
	grep '^[0-9]' -- "${filename}"
}

# The arithmetic tests below need plain positive integers. Without this guard
# an empty or zero value leads to shell arithmetic errors and a broken resample step.
if [[ ! "${resolution}" =~ ^[1-9][0-9]*$ ]]; then
	print_data_rows
	echo "ERROR: requested resolution must be a positive whole number of seconds." 1>&2
	exit
fi
if [[ ! "${timeresolution}" =~ ^[1-9][0-9]*$ ]]; then
	print_data_rows
	echo "ERROR: could not determine the time resolution of the input file." 1>&2
	exit
fi

# If time resolution of file is larger than requested resolution, just give the output, and send error message to stderr
if (( timeresolution > resolution )); then
	print_data_rows
	echo "ERROR: requested resolution smaller than original resolution." 1>&2
	exit
fi

# If time resolution of file equals the requested resolution, just give the output
if (( timeresolution == resolution )); then
	print_data_rows
	exit
fi

# If the requested resolution is more than one day, give error and just give the output equal to the input.
if (( resolution > 86400 )); then
	print_data_rows
	echo "ERROR: requested resolution larger than 1 day (86400 seconds). This script can't handle that." 1>&2
	exit
fi

91
92
# Resample the data rows to the requested resolution.
#
# The work is a three stage pipeline:
#   flag_and_fill_rows | merge_flagged_rows | resample_rows
# Both resampling methods share the pipeline; only the accumulation rule in
# resample_rows differs (resamplemethod 0: just resample, 1: take the mean).

# flag_and_fill_rows: write the data rows of ${filename}, each prefixed with the
# flag 0 (original data), followed by one nodata row per native time step between
# the first and the last timestamp, each prefixed with the flag 1 (filler).
# Uses strftime, so an awk that provides it (e.g. gawk) is required.
# All values are handed to awk with -v instead of being spliced into the program text.
flag_and_fill_rows() {
	grep '^[0-9]' -- "${filename}" \
		| awk -v first="${firsttimestamp}" -v last="${lasttimestamp}" \
			-v step="${timeresolution}" -v nsensors="${nsensors}" \
			-v nodata="${nodatavalue}" '
			{ print 0, $0 }
			END {
				# A step that is not positive would never terminate the loop.
				if (step + 0 > 0) {
					for (j = first + 0; j <= last + 0; j += step) {
						printf "1 %s", strftime("%Y-%m-%dT%H:%M", j)
						for (i = 1; i <= nsensors + 0; i++) {
							printf " %s", nodata
						}
						printf "\n"
					}
				}
			}'
}

# merge_flagged_rows: read flagged rows on stdin and write one row per timestamp
# (first 16 characters) without the flag, preferring original data over filler.
# The sort keys are limited to the timestamp field (-k2,2) and then the flag
# field (-k1,1). An open-ended key (-k 2) would extend to the end of the line
# and let the data values decide the order, so a filler row could come before
# the original row and win in uniq. LC_ALL=C makes the order byte-wise and
# independent of the caller's locale.
merge_flagged_rows() {
	LC_ALL=C sort -k2,2 -k1,1 | cut -d' ' -f2- | uniq -w16
}

# resample_rows <mean_all> <nodata> <col_psum> <resolution_minutes>
# Read gap-free rows on stdin and write one row for every timestamp whose
# day-of-month/hour/minute, expressed in minutes, is a multiple of
# <resolution_minutes>.
#   mean_all = 0: a column takes the last non-nodata value seen since the
#                 previous output row ({sum[k]=$k; n[k]=1}).
#   mean_all = 1: a column is the mean of all non-nodata values seen since the
#                 previous output row ({sum[k]+=$k; n[k]+=1}).
# The PSUM column (<col_psum>) is always averaged, else it makes no sense.
# Columns without any valid value in the interval are written as <nodata>.
resample_rows() {
	awk -v mean_all="$1" -v nodata="$2" -v col_psum="$3" -v resmin="$4" '
		{
			for (k = 2; k <= NF; k++) {
				if ($k != nodata) {
					if (mean_all + 0 || k == col_psum + 0) {
						sum[k] += $k; n[k] += 1
					} else {
						sum[k] = $k; n[k] = 1
					}
				}
			}
			minutes = substr($1, 9, 2) * 60 * 24 + substr($1, 12, 2) * 60 + substr($1, 15, 2)
			if (minutes % (resmin + 0) == 0) {
				printf "%s", $1
				for (k = 2; k <= NF; k++) {
					printf " %s", (n[k] > 0) ? sum[k] / n[k] : nodata
					sum[k] = 0; n[k] = 0
				}
				printf "\n"
			}
		}'
}

# Run the pipeline for either method.
case "${resamplemethod}" in
	0|1)
		flag_and_fill_rows \
			| merge_flagged_rows \
			| resample_rows "${resamplemethod}" "${nodatavalue}" "${col_psum}" "${resolution_minutes}"
		;;
esac
120

121
#Reset time zone
122
# Remove the TZ=UTC override exported earlier for awk's time functions. This
# unsets TZ rather than restoring any value it had before the script started,
# and the early `exit` paths above never reach this line.
unset TZ