From: Pierre Choffet Date: Wed, 12 May 2021 20:23:12 +0000 (-0400) Subject: Add merge Bash script X-Git-Url: https://git.wikimedia.ca/?a=commitdiff_plain;h=205649e59339883769b0bc664fee1256d5816350;p=eccc_to_commons.git Add merge Bash script --- diff --git a/README b/README index 9cf1c30..1ba88d4 100644 --- a/README +++ b/README @@ -21,6 +21,7 @@ dllist.sh outputs a curl configuration file listing all availabl eccc_fixer.sh fix upstream data XML files eccc_fixer.xslt fix upstream data XML file commons_rules.xsd validate ECCC XML from a Wikimedian point of view +eccc_merger.sh merge multiple ECCC XML files eccc_to_commons.sh transform ECCC XML files into JSON monthly_to_commons.xslt transform ECCC monthly XML file into JSON almanac_to_commons.xslt transform ECCC almanac XML file into JSON @@ -130,6 +131,20 @@ Same as previously, the output should be empty. Otherwise, you must resolve every single problem before continuing. +[OPTIONAL STEP] Merge multiple XML files +Sometimes, per-station granularity is too fine-grained. If you need to merge +two or more XML files, you can use the eccc_merger.sh script: + + $ ./eccc_merger.sh "${ECCC_CACHE}/almanac/3050519.xml" \ + "${ECCC_CACHE}/almanac/3050520.xml" "${ECCC_CACHE}/almanac/3050521.xml" \ + "${ECCC_CACHE}/almanac/3050522.xml" "${ECCC_CACHE}/almanac/3050526.xml" \ + > banff.xml + +In order to get station IDs based on their geographical position, you can use +the eccc_map tool. A public instance is hosted online at +https://stations.wikimedia.ca/ . + + 4. Transform data into target format Here we are, here is the fun part: let's create weather data in Wikimedia Commons format. 
#!/bin/bash

# eccc_merger.sh - Merge Environment and Climate change Canada historical data
# Copyright (C) 2020, 2021 Pierre Choffet
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# Usage: eccc_merger.sh FILE FILE [FILE ...] > merged.xml
#
# Folds the given ECCC XML files together, left to right, by repeatedly
# running an XSLT stylesheet through xmlstarlet. The merged document is
# written to standard output; every diagnostic goes to standard error so
# that redirecting stdout to a file never pollutes the XML.
#
# Exit status: 0 on success (or when help is requested), non-zero on any
# error (bad arguments, missing file, mixed or unsupported file types,
# xmlstarlet failure).

# -e: stop at the first failing command; -u: reject unset variables;
# pipefail: a pipeline fails if any of its stages fails.
set -euo pipefail

# Directory holding this script, used to look up the stylesheet.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null && pwd)"
readonly SCRIPT_DIR

# Scratch directory for intermediate merge results; created in main().
WORK_DIR=''

# Remove the scratch directory, if one was created.
# Registered on EXIT so it also runs when a command fails under `set -e`.
cleanup() {
  if [[ -n "${WORK_DIR}" ]]; then
    rm -rf -- "${WORK_DIR}"
  fi
}
trap cleanup EXIT

# Print the help text to standard output. The caller decides where the text
# goes (stdout/stderr) and which exit status to use.
usage() {
  cat <<'EOF'
Merge XMLs provided by Environment and Climate change Canada
Usage: eccc_merger.sh [ECCC XML path […]]

At least two XML files must be provided for them to be merged (obviously…).
Subsequent files will be merged as well, result is written to standard
output.
All XMLs must be valid to commons_rules.xsd and be same type (monthly or
almanach)
EOF
}

# Print a message on standard error and abort with status 1.
# Arguments: the message words.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the detected type of an ECCC XML file on standard output:
# "almanach" when /climatedata/month exists, "monthly" when
# /climatedata/stationdata exists, nothing otherwise.
# Arguments: $1 - path of the XML file to inspect.
detect_type() {
  local path=$1

  # A non-zero status from xmlstarlet (nothing selected, unreadable XML, ...)
  # is deliberately not fatal here: it yields an empty result, which the
  # caller reports with an explicit error message.
  xmlstarlet sel -t -i '/climatedata/month' -o almanach --elif '/climatedata/stationdata' -o monthly --else -o '' "${path}" || true
}

# Validate the arguments, pick the stylesheet and merge the files.
# Arguments: the XML paths to merge, in merge order (at least two).
main() {
  local path file_type stylesheet_name stylesheet merged next
  local merge_type=''

  case "${1:-}" in
    -h | --help)
      usage
      exit 0
      ;;
  esac

  # Merging needs at least two inputs.
  if (($# < 2)); then
    usage >&2
    exit 1
  fi

  command -v xmlstarlet > /dev/null \
    || die 'xmlstarlet is required but was not found in PATH'

  # Check every input up front so nothing is merged unless all files exist
  # and share the same type. "$@" is quoted to keep paths containing spaces
  # or glob characters intact.
  for path in "$@"; do
    [[ -f "${path}" ]] || die "File ${path} doesn't exist"

    file_type="$(detect_type "${path}")"
    [[ -n "${file_type}" ]] \
      || die "File ${path} is neither a monthly nor an almanach ECCC XML"

    if [[ -z "${merge_type}" ]]; then
      merge_type="${file_type}"
    elif [[ "${file_type}" != "${merge_type}" ]]; then
      die 'All XMLs must be the same type'
    fi
  done

  # Select stylesheet to be used
  case "${merge_type}" in
    almanach)
      stylesheet_name='eccc_merger_almanach.xslt'
      ;;
    monthly)
      # Reported on stderr with a failure status: stdout is reserved for the
      # merged XML and callers must be able to detect that nothing was merged.
      die 'Not working with monthly data for now. Exiting.'
      ;;
    *)
      die "Unsupported merge type: ${merge_type}"
      ;;
  esac

  # Prefer the stylesheet sitting next to this script so the tool works from
  # any directory; otherwise keep the bare name, resolved by xmlstarlet
  # against the current directory.
  if [[ -f "${SCRIPT_DIR}/${stylesheet_name}" ]]; then
    stylesheet="${SCRIPT_DIR}/${stylesheet_name}"
  else
    stylesheet="${stylesheet_name}"
  fi

  WORK_DIR="$(mktemp -d)"
  merged="${WORK_DIR}/merged.xml"
  next="${WORK_DIR}/next.xml"

  # Seed the accumulator with the first file, then fold every remaining file
  # into it. Each remaining path is handed to the stylesheet through its
  # "merge-path" string parameter.
  cp -- "$1" "${merged}"
  shift
  for path in "$@"; do
    xmlstarlet tr "${stylesheet}" -s "merge-path=${path}" "${merged}" > "${next}"
    mv -- "${next}" "${merged}"
  done

  cat -- "${merged}"
}

main "$@"