#!/bin/bash

# eccc_to_commons - Batch convert Environment and Climate change Canada
#                   historical XML data into a JSON format suitable for
#                   Wikimedia Commons.
# Copyright (C) 2019-2020  Pierre Choffet
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# Usage: eccc_to_commons.sh <source folder> <destination folder>
#
# Every '*.xml' file found under the source is classified from the attributes
# carried by its <stationdata> elements (hourly, daily, monthly) or from the
# presence of <month> elements (almanac). Only monthly files are converted at
# the moment; they are written to
# <destination folder>/weather.gc.ca/Monthly/<station id>.tab.
# Requires xmlstarlet.

set -e

# Print a message on stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the command line help on stderr and abort the script with status 1.
usage() {
  echo 'Fix XMLs provided by Environment and Climate change Canada' >&2
  echo 'Usage: eccc_to_commons.sh <source folder> <destination folder>' >&2
  exit 1
}

# Print the number of nodes matching an XPath expression in an XML file.
# Arguments: $1 - XML file path
#            $2 - XPath node-set expression (wrapped in count())
# Outputs:   a non-negative integer; 0 when xmlstarlet prints nothing usable.
xpath_count() {
  local xml="${1}"
  local expr="${2}"
  local result

  # The exit status of xmlstarlet is deliberately ignored here, exactly as the
  # former `declare -i VAR=$(xmlstarlet ...)` form did: only the printed value
  # matters for the classification.
  result="$(xmlstarlet sel -t -v "count(${expr})" "${xml}")" || true

  # Never let unexpected output reach an arithmetic context.
  if [[ ! "${result}" =~ ^[0-9]+$ ]]; then
    result=0
  fi
  printf '%s\n' "${result}"
}

# Print the station id encoded by a name when the name is a strictly positive
# decimal integer.
# Arguments: $1 - candidate name (directory or file name without extension)
# Returns:   0 and the id on stdout, 1 when the name is not a valid id.
station_id_from_name() {
  local name="${1}"

  # Digits only: the name is never evaluated as an arithmetic expression, so
  # names such as "2019-data" or "a[$(cmd)]" are rejected instead of computed
  # or executed.
  [[ "${name}" =~ ^[0-9]+$ ]] || return 1

  # Force base 10 so that leading zeros are not read as octal.
  local -i id=$((10#${name}))
  ((id > 0)) || return 1

  printf '%d\n' "${id}"
}

# Print the station id of an XML file, taken from the name of its parent
# directory or, failing that, from its own name without extension.
# Arguments: $1 - XML file path
# Returns:   0 and the id on stdout, 1 when no id can be detected.
detect_station_id() {
  local xml="${1}"
  local last_dir
  local file_name

  last_dir="$(basename -- "$(dirname -- "${xml}")")"
  file_name="$(basename -- "${xml%.*}")"

  station_id_from_name "${last_dir}" || station_id_from_name "${file_name}"
}

# Print the path of a stylesheet: the current directory is tried first (the
# historical behaviour), then the directory holding this script. The bare name
# is printed when neither exists so that xmlstarlet reports the problem.
# Arguments: $1 - stylesheet file name
locate_stylesheet() {
  local name="${1}"
  local script_dir

  script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

  if [[ ! -f "${name}" && -f "${script_dir}/${name}" ]]; then
    printf '%s\n' "${script_dir}/${name}"
  else
    printf '%s\n' "${name}"
  fi
}

# Entry point.
# Arguments: $1 - source folder searched recursively for '*.xml' files
#            $2 - existing destination folder
main() {
  local source="${1}"
  local destination="${2}"

  if [[ -z "${source}" || -z "${destination}" ]]; then
    usage
  fi
  # A missing source used to make find fail inside the process substitution,
  # leaving the script to exit successfully without converting anything.
  if [[ ! -e "${source}" || ! -d "${destination}" ]]; then
    usage
  fi

  local xml
  local -i stationdata_count year_count month_count day_count
  local -i hour_count minute_count al_month_count
  local station_id stylesheet_path destination_path

  # Loop on xml files in source folder. File descriptor 9 carries the file
  # list so that the standard input of the commands run in the loop is left
  # untouched.
  while IFS= read -r -d '' -u 9 xml; do
    # Check XML type
    stationdata_count="$(xpath_count "${xml}" '//stationdata')"
    year_count="$(xpath_count "${xml}" '//stationdata[@year]')"
    month_count="$(xpath_count "${xml}" '//stationdata[@month]')"
    day_count="$(xpath_count "${xml}" '//stationdata[@day]')"
    hour_count="$(xpath_count "${xml}" '//stationdata[@hour]')"
    minute_count="$(xpath_count "${xml}" '//stationdata[@minute]')"
    al_month_count="$(xpath_count "${xml}" '//month')"

    # Detect station id
    station_id="$(detect_station_id "${xml}")" \
      || die "${xml}: Cannot detect station id"

    if ((al_month_count == 0 && stationdata_count > 0 \
      && stationdata_count == year_count \
      && stationdata_count == month_count \
      && stationdata_count == day_count \
      && stationdata_count == hour_count \
      && stationdata_count == minute_count)); then
      echo "${xml}: Hourly data not compatible yet. Ignoring." >&2
      continue
    elif ((al_month_count == 0 && stationdata_count > 0 \
      && stationdata_count == year_count \
      && stationdata_count == month_count \
      && stationdata_count == day_count \
      && hour_count == 0 \
      && minute_count == 0)); then
      echo "${xml}: Daily data not compatible yet. Ignoring." >&2
      continue
    elif ((al_month_count == 0 && stationdata_count > 0 \
      && stationdata_count == year_count \
      && stationdata_count == month_count \
      && day_count == 0 \
      && hour_count == 0 \
      && minute_count == 0)); then
      stylesheet_path="$(locate_stylesheet 'monthly_to_commons.xslt')"
      destination_path="${destination}/weather.gc.ca/Monthly/${station_id}.tab"
    elif ((stationdata_count == 0 && al_month_count > 0)); then
      # Once supported, almanac data is meant to be written to
      # ${destination}/weather.gc.ca/Almanac/${station_id}.tab
      echo "${xml}: Almanac data not compatible yet. Ignoring." >&2
      continue
    else
      die "${xml}: Cannot detect file type. Exiting."
    fi

    echo "Processing ${xml} to ${destination_path}…" >&2
    mkdir -p -- "$(dirname -- "${destination_path}")"
    if ! xmlstarlet tr "${stylesheet_path}" "${xml}" > "${destination_path}"; then
      # Do not leave a truncated table behind a failed transformation.
      rm -f -- "${destination_path}"
      die "${xml}: Conversion failed. Exiting."
    fi
  done 9< <(find "${source}" -type f -name '*.xml' -print0)
}

main "$@"