]> Wikimedia Canada | Git repositories - eccc_to_commons.git/blob - dllist.sh
Rewrite almanach merge logic
[eccc_to_commons.git] / dllist.sh
1 #!/bin/bash
2
3 # dllist - Set of tools to replicate Environment and Climate change Canada data
4 # on Wikimedia Commons
5 # Copyright (C) 2019-2020 Pierre Choffet
6 #
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
16 #
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
# Location of the station inventory CSV that lists every station to process.
readonly STATION_INVENTORY_URL='ftp://client_climate@ftp.tor.ec.gc.ca/Pub/Get_More_Data_Plus_de_donnees/Station Inventory EN.csv'
# Common prefix of every bulk data URL; the per-request query parameters
# (timeframe, stationID, Year, Month, Day) are appended to it.
readonly XML_PREFIX='https://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=xml'
# Number of columns a valid station inventory line must have.
readonly CSV_COLUMNS=19

# Abort on the first unhandled command failure.
set -e
25
# Print one download entry on stdout: a `url = "…"` line, an `output = "…"`
# line and a blank separator line.
#   $1 - URL to download
#   $2 - path the download should be written to
function _emitDownloadEntry() {
	printf 'url = "%s"\noutput = "%s"\n\n' "${1}" "${2}"
}

# Print on stdout every download entry needed for one station, as
# `url = "…"` / `output = "…"` pairs (presumably meant to be fed to
# `curl --config`; confirm against the caller of this script).
#
# Globals (read only):
#   XML_PREFIX    - common prefix of the generated URLs
#   OUTPUT_PREFIX - root directory of the generated output paths
#   NOW_YEAR      - current year, used to skip hourly months in the future
#   NOW_MONTH     - current month, possibly zero-padded (e.g. "08")
# Arguments:
#   $1     - climate id, used to build the output paths
#   $2     - station id, used as the stationID query parameter
#   $3 $4  - first and last year of monthly data
#   $5 $6  - first and last year of daily data
#   $7 $8  - first and last year of hourly data
# A year that is empty or 0 means the station has no data for that timeframe.
# Outputs:
#   One monthly entry, one daily entry per year, one hourly entry per month
#   (months after NOW_MONTH of NOW_YEAR are skipped), then one almanac entry,
#   which is always emitted.
function generateStation() {
	local -r cid="${1}"
	local -ir sid="${2}"
	local -ir monthly_start_year="${3}"
	local -ir monthly_end_year="${4}"
	local -ir daily_start_year="${5}"
	local -ir daily_end_year="${6}"
	local -ir hourly_start_year="${7}"
	local -ir hourly_end_year="${8}"

	# "10#" forces base 10 so that a zero-padded month such as "08" is never
	# parsed as an (invalid) octal number. The fallbacks disable the future
	# filter when the globals are not set.
	local -ir now_year="$((10#${NOW_YEAR:-0}))"
	local -ir now_month="$((10#${NOW_MONTH:-12}))"

	# Loop variables are local so a call never clobbers the caller's variables
	local -i year month
	local padded_month

	# Generate monthly link
	if (( monthly_start_year != 0 && monthly_end_year != 0 ))
	then
		_emitDownloadEntry \
			"${XML_PREFIX}&timeframe=3&stationID=${sid}" \
			"${OUTPUT_PREFIX}/monthly/${cid}.xml"
	fi

	# Generate daily links, one per year
	if (( daily_start_year != 0 && daily_end_year != 0 ))
	then
		for (( year = daily_start_year; year <= daily_end_year; year++ ))
		do
			_emitDownloadEntry \
				"${XML_PREFIX}&timeframe=2&stationID=${sid}&Year=${year}&Month=1" \
				"${OUTPUT_PREFIX}/daily/${cid}/${year}.xml"
		done
	fi

	# Generate hourly links, one per month
	if (( hourly_start_year != 0 && hourly_end_year != 0 ))
	then
		for (( year = hourly_start_year; year <= hourly_end_year; year++ ))
		do
			for (( month = 1; month <= 12; month++ ))
			do
				# Avoid future stats
				if (( year == now_year && month > now_month ))
				then
					continue
				fi

				# The URL takes the bare month, the file name a two-digit one
				printf -v padded_month '%02d' "${month}"
				_emitDownloadEntry \
					"${XML_PREFIX}&timeframe=1&stationID=${sid}&Year=${year}&Month=${month}&Day=1" \
					"${OUTPUT_PREFIX}/hourly/${cid}/${year}-${padded_month}.xml"
			done
		done
	fi

	# Generate almanac link
	_emitDownloadEntry \
		"${XML_PREFIX}&timeframe=4&stationID=${sid}" \
		"${OUTPUT_PREFIX}/almanac/${cid}.xml"
}
88
89
# Check user input
if [[ -z "${1:-}" ]]
then
	# Diagnostics go to stderr: stdout carries the generated download list
	echo "No download folder given" >&2
	exit 1
fi
OUTPUT_PREFIX="$(realpath "${1}")"

# Get stations inventory
STATION_INVENTORY_PATH="$(mktemp)"
# Remove the temporary inventory on every exit path, failures included
trap 'rm -f -- "${STATION_INVENTORY_PATH}"' EXIT
EXPECTED_COMMAS_COUNT="$((CSV_COLUMNS - 1))"
EXPECTED_QUOTES_COUNT="$((CSV_COLUMNS * 2))"

# Date: year and month are taken from a single date call so that both
# always describe the same instant
NOW="$(date '+%Y %m')"
NOW_YEAR="${NOW% *}"
NOW_MONTH="${NOW#* }"

curl "${STATION_INVENTORY_URL}" > "${STATION_INVENTORY_PATH}"

# Read stations ids
# HEADER stays at 1 until the column names line has been seen.
HEADER=1
# The "|| [[ -n … ]]" part keeps a last line that has no trailing newline
while IFS= read -r station || [[ -n "${station}" ]]
do
	# WARNING: This is a very naive reading of a csv line. Add some sanity checks
	# so we can fail gracefully if escaped '"' or ',' added in source.
	COMMAS=${station//[^,]}
	QUOTES=${station//[^\"]}

	# A line is accepted when either count matches the expected one
	if [[ ${#COMMAS} -eq ${EXPECTED_COMMAS_COUNT} || ${#QUOTES} -eq ${EXPECTED_QUOTES_COUNT} ]]
	then
		# Parsing inside CSV content
		if [[ ${HEADER} -ne 0 ]]
		then
			# First valid line is columns name, continue
			HEADER=0
			continue
		fi

		# Split the line once on '"': element 0 is the text before the first
		# quote, so the Nth quote-delimited field is FIELDS[N-1] and quoted
		# values sit at odd indices.
		IFS='"' read -ra FIELDS <<< "${station}"
		CLIMATE_ID="${FIELDS[5]:-}"
		STATION_ID="${FIELDS[7]:-}"
		STATION_HOURLY_START="${FIELDS[27]:-}"
		STATION_HOURLY_END="${FIELDS[29]:-}"
		STATION_DAILY_START="${FIELDS[31]:-}"
		STATION_DAILY_END="${FIELDS[33]:-}"
		STATION_MONTHLY_START="${FIELDS[35]:-}"
		STATION_MONTHLY_END="${FIELDS[37]:-}"
		generateStation "${CLIMATE_ID}" "${STATION_ID}" "${STATION_MONTHLY_START}" "${STATION_MONTHLY_END}" "${STATION_DAILY_START}" "${STATION_DAILY_END}" "${STATION_HOURLY_START}" "${STATION_HOURLY_END}"
	elif [[ ${HEADER} -eq 0 ]]
	then
		# NOTE: The script expects the CSV to contain 19 columns (CSV_COLUMNS). If the structure changed, please warn the developers.
		echo "Structure of station inventory file changed. See comment above line ${LINENO} for more explanation. Exiting." >&2
		exit 1
	fi
	# Any other line is part of the preamble before the column names: skip it
done < "${STATION_INVENTORY_PATH}"

# Clean: the temporary inventory is removed by the EXIT trap set above