Wikimedia Canada | Git repositories - eccc_to_commons.git/blobdiff - eccc_to_commons.sh
Complete process from ECCC website to Commons files generation
[eccc_to_commons.git] / eccc_to_commons.sh
diff --git a/eccc_to_commons.sh b/eccc_to_commons.sh
new file mode 100755 (executable)
index 0000000..c580fbb
--- /dev/null
@@ -0,0 +1,106 @@
+#!/bin/bash
+
+# eccc_to_commons - Batch convert Environment and Climate Change Canada
+#                   historical XML data into a JSON format suitable for
+#                   Wikimedia Commons.
+# Copyright (C) 2019-2020  Pierre Choffet
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+set -e
+
+SOURCE="${1}"
+DESTINATION="${2}"
+# Both are mandatory: the source must exist, the destination be a directory.
+if [[ ! -e "${SOURCE}" || ! -d "${DESTINATION}" ]]
+then
+       echo 'Convert XMLs provided by Environment and Climate Change Canada to Wikimedia Commons data' >&2
+       echo 'Usage: eccc_to_commons.sh <source folder> <destination folder>' >&2
+       exit 1
+fi
+
+# Loop on xml files in source folder. find emits NUL-terminated paths, read
+# one by one into REPLY through file descriptor 9.
+while IFS= read -r -d '' -u 9
+do
+       # Check XML type: count the elements and attributes that tell the
+       # known layouts apart.
+       # Note: "declare -i" returns its own status, so a failing xmlstarlet
+       # call does not stop the script here; an empty output is simply
+       # stored as 0.
+       declare -i STATIONDATA_COUNT=$(xmlstarlet sel -t -v 'count(//stationdata)' "${REPLY}")
+       declare -i YEAR_COUNT=$(xmlstarlet sel -t -v 'count(//stationdata[@year])' "${REPLY}")
+       declare -i MONTH_COUNT=$(xmlstarlet sel -t -v 'count(//stationdata[@month])' "${REPLY}")
+       declare -i DAY_COUNT=$(xmlstarlet sel -t -v 'count(//stationdata[@day])' "${REPLY}")
+       declare -i HOUR_COUNT=$(xmlstarlet sel -t -v 'count(//stationdata[@hour])' "${REPLY}")
+       declare -i MINUTE_COUNT=$(xmlstarlet sel -t -v 'count(//stationdata[@minute])' "${REPLY}")
+       declare -i AL_MONTH_COUNT=$(xmlstarlet sel -t -v 'count(//month)' "${REPLY}")
+
+       # Detect station id: a strictly positive decimal number, taken from
+       # the name of the directory holding the file or else from the file
+       # name without extension. Names are matched as text, not evaluated
+       # arithmetically, so "08", "a.b" or "my dir" cannot raise shell errors.
+       LAST_DIR="$(basename -- "$(dirname -- "${REPLY}")")"
+       FILENAME="$(basename -- "${REPLY%.*}")"
+
+       if [[ "${LAST_DIR}" =~ ^0*[1-9][0-9]*$ ]]
+       then
+               # Station id is in last directory name (10# forces base ten)
+               STATION_ID="$(( 10#${LAST_DIR} ))"
+       elif [[ "${FILENAME}" =~ ^0*[1-9][0-9]*$ ]]
+       then
+               # Station id is in file name (10# forces base ten)
+               STATION_ID="$(( 10#${FILENAME} ))"
+       else
+               echo "${REPLY}: Cannot detect station id" >&2
+               exit 1
+       fi
+
+       # 1 when the file has stationdata elements, no almanac month element,
+       # and every stationdata carries a year and a month attribute; else 0.
+       HAS_YEAR_MONTH=$(( AL_MONTH_COUNT == 0 && STATIONDATA_COUNT > 0 &&
+                          STATIONDATA_COUNT == YEAR_COUNT &&
+                          STATIONDATA_COUNT == MONTH_COUNT ))
+
+       if [[ ${HAS_YEAR_MONTH} -eq 1 && ${STATIONDATA_COUNT} -eq ${DAY_COUNT} &&
+             ${STATIONDATA_COUNT} -eq ${HOUR_COUNT} &&
+             ${STATIONDATA_COUNT} -eq ${MINUTE_COUNT} ]]
+       then
+               echo "${REPLY}: Hourly data not compatible yet. Ignoring." >&2
+               continue
+       elif [[ ${HAS_YEAR_MONTH} -eq 1 && ${STATIONDATA_COUNT} -eq ${DAY_COUNT} &&
+               ${HOUR_COUNT} -eq 0 && ${MINUTE_COUNT} -eq 0 ]]
+       then
+               echo "${REPLY}: Daily data not compatible yet. Ignoring." >&2
+               continue
+       elif [[ ${HAS_YEAR_MONTH} -eq 1 && ${DAY_COUNT} -eq 0 &&
+               ${HOUR_COUNT} -eq 0 && ${MINUTE_COUNT} -eq 0 ]]
+       then
+               # Monthly data is the only type converted by this loop. The
+               # stylesheet path is relative to the current directory.
+               STYLESHEET_PATH='monthly_to_commons.xslt'
+               DESTINATION_PATH="${DESTINATION}/weather.gc.ca/Monthly/${STATION_ID}.tab"
+       elif [[ ${STATIONDATA_COUNT} -eq 0 && ${AL_MONTH_COUNT} -gt 0 ]]
+       then
+               echo "${REPLY}: Almanac data not compatible yet. Ignoring." >&2
+               continue
+       else
+               echo "${REPLY}: Cannot detect file type. Exiting." >&2
+               exit 1
+       fi
+
+       echo "Processing ${REPLY} to ${DESTINATION_PATH}…" >&2
+       mkdir -p "$(dirname "${DESTINATION_PATH}")"
+       xmlstarlet tr "${STYLESHEET_PATH}" "${REPLY}" > "${DESTINATION_PATH}"
+done 9< <( find "${SOURCE}" -type f -name '*.xml' -print0 )