]> Wikimedia Canada | Git repositories - eccc_to_commons.git/blob - eccc_to_commons.sh
Rewrite almanach merge logic
[eccc_to_commons.git] / eccc_to_commons.sh
1 #!/bin/bash
2
3 # eccc_to_commons - Batch convert Environment and Climate change Canada
4 # historical XML data into a JSON format suitable for
5 # Wikimedia Commons.
6 # Copyright (C) 2019-2020 Pierre Choffet
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
# Abort on the first failing command. pipefail extends that to every stage of
# a pipeline, so a failing producer is not hidden by a succeeding consumer.
set -e
set -o pipefail

# Positional arguments:
#   $1 - source folder searched recursively for *.xml files
#   $2 - existing destination folder receiving the generated .tab files
SOURCE="${1:-}"
DESTINATION="${2:-}"

# Both arguments are mandatory. The source must exist (otherwise find would
# fail silently and nothing would be converted) and the destination must
# already be a directory.
if [[ -z "${SOURCE}" || ! -e "${SOURCE}" || -z "${DESTINATION}" || ! -d "${DESTINATION}" ]]
then
  # This is an error exit, so the usage text belongs on stderr.
  {
    echo 'Fix XMLs provided by Environment and Climate change Canada'
    echo 'Usage: eccc_to_commons.sh <source folder> <destination folder>'
  } >&2
  exit 1
fi
32
#######################################
# Print the number of nodes matching an XPath expression in an XML file.
# Arguments: $1 - XPath node-set expression (e.g. '//stationdata[@year]')
#            $2 - path to the XML file
# Outputs:   the count on stdout
# Returns:   1 when xmlstarlet does not yield a plain non-negative integer
#######################################
count_nodes() {
  local xpath="${1}" xml_path="${2}" result

  # The exit status is deliberately ignored: validating the output itself
  # catches unreadable or malformed files whatever status the tool reports.
  result="$(xmlstarlet sel -t -v "count(${xpath})" "${xml_path}")" || true
  [[ "${result}" =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "${result}"
}

#######################################
# Detect the climate id (7 upper-case letters/digits) of an XML file from
# its parent directory name or, failing that, its extension-less file name.
# Arguments: $1 - path to the XML file
# Outputs:   the climate id on stdout
# Returns:   1 when neither name looks like a climate id
#######################################
detect_climate_id() {
  local xml_path="${1}" parent_dir file_stem

  # Every expansion is quoted so that paths containing whitespace are not
  # split into several basename arguments.
  parent_dir="$(basename -- "$(dirname -- "${xml_path}")")"
  file_stem="$(basename -- "${xml_path%.*}")"

  if [[ "${parent_dir}" =~ ^[A-Z0-9]{7}$ ]]
  then
    # Climate id is in last directory name
    printf '%s\n' "${parent_dir}"
  elif [[ "${file_stem}" =~ ^[A-Z0-9]{7}$ ]]
  then
    # Climate id is in file name
    printf '%s\n' "${file_stem}"
  else
    return 1
  fi
}

#######################################
# Classify an XML file from its node counts.
# Arguments: $1 - count of <stationdata>
#            $2..$6 - count of <stationdata> carrying a year, month, day,
#                     hour and minute attribute respectively
#            $7 - count of <month> elements
# Outputs:   one of hourly, daily, monthly, almanac on stdout
# Returns:   1 when the counts match none of the known layouts
#######################################
classify_counts() {
  local -i stationdata="${1}" year="${2}" month="${3}" day="${4}"
  local -i hour="${5}" minute="${6}" al_month="${7}"

  if (( stationdata == 0 && al_month > 0 ))
  then
    printf 'almanac\n'
    return 0
  fi

  # Every remaining layout has no <month> element and a year and month
  # attribute on each <stationdata>.
  if (( al_month != 0 || stationdata <= 0 || stationdata != year || stationdata != month ))
  then
    return 1
  fi

  if (( day == stationdata && hour == stationdata && minute == stationdata ))
  then
    printf 'hourly\n'
  elif (( day == stationdata && hour == 0 && minute == 0 ))
  then
    printf 'daily\n'
  elif (( day == 0 && hour == 0 && minute == 0 ))
  then
    printf 'monthly\n'
  else
    return 1
  fi
}

declare -i STATIONDATA_COUNT YEAR_COUNT MONTH_COUNT DAY_COUNT
declare -i HOUR_COUNT MINUTE_COUNT AL_MONTH_COUNT AL_DAY_COUNT
declare CLIMATE_ID DATA_KIND STYLESHEET_PATH DESTINATION_PATH
declare RAW_JSON PRETTY_JSON

# Loop on xml files in source folder (NUL-delimited so any file name is safe;
# fd 9 keeps stdin free for the commands run inside the loop)
while IFS= read -r -d '' -u 9
do
  # Check XML type
  if ! { STATIONDATA_COUNT="$(count_nodes '//stationdata' "${REPLY}")" &&
         YEAR_COUNT="$(count_nodes '//stationdata[@year]' "${REPLY}")" &&
         MONTH_COUNT="$(count_nodes '//stationdata[@month]' "${REPLY}")" &&
         DAY_COUNT="$(count_nodes '//stationdata[@day]' "${REPLY}")" &&
         HOUR_COUNT="$(count_nodes '//stationdata[@hour]' "${REPLY}")" &&
         MINUTE_COUNT="$(count_nodes '//stationdata[@minute]' "${REPLY}")" &&
         AL_MONTH_COUNT="$(count_nodes '//month' "${REPLY}")"; }
  then
    echo "${REPLY}: Cannot read XML content. Exiting." >&2
    exit 1
  fi

  # Detect climate id
  if ! CLIMATE_ID="$(detect_climate_id "${REPLY}")"
  then
    echo "${REPLY}: Cannot detect climate id" >&2
    exit 1
  fi

  if ! DATA_KIND="$(classify_counts "${STATIONDATA_COUNT}" "${YEAR_COUNT}" \
    "${MONTH_COUNT}" "${DAY_COUNT}" "${HOUR_COUNT}" "${MINUTE_COUNT}" \
    "${AL_MONTH_COUNT}")"
  then
    echo "${REPLY}: Cannot detect file type. Exiting." >&2
    exit 1
  fi

  case "${DATA_KIND}" in
    hourly)
      echo "${REPLY}: Hourly data not compatible yet. Ignoring." >&2
      continue
      ;;
    daily)
      echo "${REPLY}: Daily data not compatible yet. Ignoring." >&2
      continue
      ;;
    monthly)
      STYLESHEET_PATH='monthly_to_commons.xslt'
      DESTINATION_PATH="${DESTINATION}/weather.gc.ca/Monthly/${CLIMATE_ID}.tab"
      ;;
    almanac)
      # Check file contains data
      if ! AL_DAY_COUNT="$(count_nodes '//day' "${REPLY}")"
      then
        echo "${REPLY}: Cannot read XML content. Exiting." >&2
        exit 1
      fi
      if (( AL_DAY_COUNT == 0 ))
      then
        echo "${REPLY}: No day found. Ignoring." >&2
        continue
      fi

      STYLESHEET_PATH='almanac_to_commons.xslt'
      DESTINATION_PATH="${DESTINATION}/weather.gc.ca/Almanac/${CLIMATE_ID}.tab"
      ;;
  esac

  echo "Processing ${REPLY} to ${DESTINATION_PATH}…" >&2
  mkdir -p "$(dirname "${DESTINATION_PATH}")"

  # Run both steps to completion before touching the destination, so a failed
  # transformation never leaves a truncated or empty .tab file behind.
  # NOTE(review): stylesheet paths are relative, so they resolve against the
  # current working directory - confirm that is the intended invocation.
  if ! RAW_JSON="$(xmlstarlet tr "${STYLESHEET_PATH}" "${REPLY}")"
  then
    echo "${REPLY}: XSLT transformation failed. Exiting." >&2
    exit 1
  fi
  if ! PRETTY_JSON="$(jq . <<<"${RAW_JSON}")"
  then
    echo "${REPLY}: Transformation output is not valid JSON. Exiting." >&2
    exit 1
  fi
  printf '%s\n' "${PRETTY_JSON}" > "${DESTINATION_PATH}"
done 9< <( find "${SOURCE}" -type f -name '*.xml' -print0 )