Commit 1b300534 authored by Espen Sollum's avatar Espen Sollum

Added global error scaling

parent c3745e41
......@@ -86,7 +86,7 @@ def read_conc(self, model, dir_initconc, file_initconc, **kwargs):
xmod_ver['height'] = np.array(model.domain.heights)
# Use middle of month to represent the time of concentration data TODO: check
# Use middle of month to represent the time of concentration data
xmod_ver['time'] = datetime.strptime(file_conc, file_initconc) + timedelta(days=14)
if conc_all is None:
......@@ -94,6 +94,4 @@ def read_conc(self, model, dir_initconc, file_initconc, **kwargs):
else:
conc_all = xr.concat([conc_all, xmod_ver], 'time')
# import pdb; pdb.set_trace()
return conc_all
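For orientation, a minimal self-contained sketch of the accumulation pattern used in read_conc above (hypothetical array sizes; not the pycif reader itself): monthly fields are stacked along the time dimension with xr.concat.
import numpy as np
import xarray as xr

conc_all = None
for month in (1, 2):
    # One vertical concentration field per monthly file (height, lat, lon)
    xmod_ver = xr.DataArray(np.random.rand(5, 3, 4),
                            dims=('height', 'lat', 'lon'))
    # Middle of the month represents the time of the concentration data
    xmod_ver = xmod_ver.expand_dims(time=[np.datetime64('2012-%02d-15' % month)])
    conc_all = xmod_ver if conc_all is None else xr.concat([conc_all, xmod_ver], 'time')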
......@@ -7,6 +7,7 @@ from pycif.utils.geometry import dist_matrix
from pycif.utils.netcdf import readnc
from .utils.scalemaps import map2scale
import pycif.utils.check as check
from pycif.utils.check import verbose
def build_hcorrelations(zlat, zlon, lsm,
......@@ -14,7 +15,7 @@ def build_hcorrelations(zlat, zlon, lsm,
evalmin=0.5, regions=False,
hresoldim=None,
dump=False, dir_dump='', projection='gps',
tracer=None, errscalar=None,
tracer=None, cntrlv=None, glob_err=None,
**kwargs):
"""Build horizontal correlation matrix based on distance between grid
cells.
......@@ -104,10 +105,24 @@ def build_hcorrelations(zlat, zlon, lsm,
corr = np.exp(old_div(-dx, sigma))
corr[sigma <= 0] = 0
# If total error specified, scale correlation matrix
if errscalar is not None:
corr *= errscalar**2
# Scale covariance in accordance with global total error
# Prior error averaged over time steps
xerr = np.mean(np.reshape(cntrlv.std, (tracer.ndates,
int(len(cntrlv.std)/tracer.ndates))), axis=0)
# cov = np.outer(cntrlv.std, cntrlv.std) * corr
cov = np.outer(xerr, xerr) * corr
covsum = cov*np.outer(cntrlv.area_reg, cntrlv.area_reg)
toterr = np.sqrt(covsum.sum())*3600.*24.*365./1.e9/float(tracer.numscale)
errscalar = glob_err/toterr
verbose("Total error scaled by "+ str(errscalar))
verbose("covsum "+ str(covsum.sum()))
# ESO TODO: try scaling std instead
# corr = corr*errscalar**2
cntrlv.std = cntrlv.std*errscalar**2
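As a worked illustration of the block above, a minimal sketch of the global-error scaling (the helper name and array shapes are assumptions, not pycif API): time-averaged prior errors are turned into an area-weighted covariance, summed over the domain, converted to Tg/yr, and compared with the prescribed budget glob_err.
import numpy as np

def global_error_scaling(std, corr, area_reg, ndates, glob_err, numscale=1.0):
    # std: prior error per region and time step, shape (ndates * nreg,)
    # corr: spatial correlation matrix, shape (nreg, nreg)
    # area_reg: area of each region in m2, shape (nreg,)
    nreg = std.size // ndates
    # Prior error averaged over time steps
    xerr = std.reshape(ndates, nreg).mean(axis=0)
    # Area-weighted error covariance of the domain total
    covsum = np.outer(xerr, xerr) * corr * np.outer(area_reg, area_reg)
    # kg/s -> kg/yr -> Tg/yr (3600*24*365 s per year, 1e9 kg per Tg)
    toterr = np.sqrt(covsum.sum()) * 3600. * 24. * 365. / 1.e9 / numscale
    return glob_err / toterr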
# Component analysis
evalues, evectors = np.linalg.eigh(corr)
......@@ -147,7 +162,6 @@ def dump_hcorr(nlon, nlat, sigma_sea, sigma_land,
ncell = evalues.size
# TODO change file name for spatial regions
file_dump = '{}/horcor_{}x{}_cs{}_cl{}.bin'.format(
dir_dump, nlon, nlat, sigma_sea, sigma_land)
......@@ -179,7 +193,6 @@ def read_hcorr(nlon, nlat, sigma_sea, sigma_land, dir_dump, hresoldim=None):
"""
# TODO file name
file_dump = '{}/horcor_{}x{}_cs{}_cl{}.bin'.format(
dir_dump, nlon, nlat, sigma_sea, sigma_land)
......
......@@ -19,8 +19,6 @@ def init_bprod(cntrlv, options={}, **kwargs):
Returns:
updated control vector:
Todo:
ESO: Include total domain error to save/load routines
"""
......@@ -92,9 +90,6 @@ def init_bprod(cntrlv, options={}, **kwargs):
tracer.chi_hresoldim = sqrt_evalues.size
elif hasattr(tracer, 'hcorrelations') and tracer.hresol == 'regions':
# TODO don't need a separate block for regions, instead
# pass tracer.hresol/regions keyword to build_hcorr
# Scale by domain total error (given as Tg/y)
if hasattr(tracer, 'glob_err'):
glob_err = float(tracer.glob_err)
......@@ -102,23 +97,25 @@ def init_bprod(cntrlv, options={}, **kwargs):
raise Exception("glob_err must be >0")
# Get area of region boxes
# TODO: could compute elsewhere and store in tracer.domain
area_reg = np.squeeze(map2scale(
tracer.domain.areas[np.newaxis, np.newaxis,:,:],
tracer,
tracer.domain,
region_scale_area=False, region_max_val=False))
errsum = np.dot(cntrlv.std, np.tile(area_reg, tracer.ndates))/float(tracer.ndates)
toterr = errsum*3600.*24.*365./1.e9/float(tracer.numscale)
errscalar = glob_err/toterr
cntrlv.std = cntrlv.std*errscalar**2
verbose("Total error scaled by "+ str(errscalar))
verbose("errsum "+ str(errsum))
# TODO: rework this based on most recent flexinvert version,
# including using xerr (time avg error)
cntrlv.area_reg = area_reg
# New implementation done in build_hcorr.py
# Old implementation of glob_err, consistent with 2020 version of
# flexinvertplus:
# errsum = np.dot(cntrlv.std, np.tile(area_reg, tracer.ndates))/float(tracer.ndates)
# toterr = errsum*3600.*24.*365./1.e9/float(tracer.numscale)
# errscalar = glob_err/toterr
# cntrlv.std = cntrlv.std*errscalar**2
# verbose("Total error scaled by "+ str(errscalar))
# verbose("errsum "+ str(errsum))
corr = tracer.hcorrelations
......@@ -147,7 +144,7 @@ def init_bprod(cntrlv, options={}, **kwargs):
# or separated land and sea, along a land-sea mask
# Default is no separation
lsm = getattr(corr, 'landsea', False)
# import pdb; pdb.set_trace()
if lsm:
sigma_land = getattr(corr, 'sigma_land', -1)
sigma_sea = getattr(corr, 'sigma_sea', -1)
......@@ -178,7 +175,7 @@ def init_bprod(cntrlv, options={}, **kwargs):
dump=dump_hcorr, dir_dump=dircorrel,
projection=projection, regions=True,
hresoldim=tracer.hresoldim, tracer=tracer,
errscalar=errscalar, **kwargs)
cntrlv=cntrlv, glob_err=glob_err, **kwargs)
# Storing computed correlations for use by other components
cntrlv.hcorrelations[(sigma_land, sigma_sea)] = \
......@@ -241,7 +238,6 @@ def init_bprod(cntrlv, options={}, **kwargs):
tracer.chi_pointer = cntrlv.chi_dim
tracer.chi_dim = tracer.chi_hresoldim * tracer.chi_tresoldim
cntrlv.chi_dim += tracer.chi_dim
#import pdb; pdb.set_trace()
# Defining chi from the total dimension
cntrlv.chi = np.zeros((cntrlv.chi_dim,))
......
......@@ -152,6 +152,4 @@ def init_xb(cntrlv, **kwargs):
cntrlv.xb = np.append(cntrlv.xb, xb)
cntrlv.std = np.append(cntrlv.std, std)
#import pdb; pdb.set_trace()
return cntrlv
......@@ -7,9 +7,6 @@ def sqrtbprod(controlvect, chi, **kwargs):
"""
# ESO check dim
#import pdb; pdb.set_trace()
# Initializes output vector
xout = np.zeros(controlvect.dim)
......@@ -73,8 +70,6 @@ def sqrtbprod(controlvect, chi, **kwargs):
# Fill data for background scalars (if present)
if hasattr(controlvect, 'background'):
# xout[-controlvect.background.ncini::] = chi[-controlvect.background.ncini::]
# ESO: is this correct for multiple cini periods?
xout[-controlvect.cinidim::] = chi[-controlvect.cinidim::]
return xout * controlvect.std + controlvect.xb
......@@ -144,9 +139,7 @@ def sqrtbprod_ad(controlvect, dx, **kwargs):
# Fill data for background scalars (if present)
if hasattr(controlvect, 'background'):
chiout[-controlvect.background.ncini::] = dx[-controlvect.background.ncini::] * \
chiout[-controlvect.cinidim::] = dx[-controlvect.cinidim::] * \
controlvect.background.cini_err
return chiout
......@@ -83,4 +83,3 @@ def create_domain(domain,
domain.iy1 = iy1
domain.iy2 = iy2
# import pdb; pdb.set_trace()
......@@ -62,7 +62,7 @@ def read(self, name, tracdir, tracfic, dates,
flx_reg = data[:, iy0:iy0+self.domain.nlat, ix0:ix0+self.domain.nlon]
# converting to regular np array and appending
# Converting to regular np array and appending
np_flx = np.array(flx_reg)
if first:
......@@ -71,53 +71,12 @@ def read(self, name, tracdir, tracfic, dates,
else:
trcr_flx = np.append(trcr_flx, np_flx, axis=0)
# trcr_flx.append(flx_reg)
# nc = xr.open_dataset(
# '{}/{}'.format(tracdir, fic_flx),
# decode_times=False)
# nlon = self.domain.nlon
# nlat = self.domain.nlat
# Vector to map
# Deals with polar boxes by sub-dividing them zonally
# Also loops zonally for consistency with other call to gridded values
# flx = nc['flx_{}'.format(name.lower())].values
# flx0 = flx[:, 0]
# flx1 = flx[:, -1]
# flx = flx[:, 1:-1].reshape((-1, nlat - 2, nlon - 1))
# flx = np.append(flx, flx1[:, np.newaxis, np.newaxis]
# * np.ones((1, 1, nlon - 1)), axis=1)
# flx = np.append(flx0[:, np.newaxis, np.newaxis]
# * np.ones((1, 1, nlon - 1)), flx, axis=1)
# flx = np.append(flx, flx[:, :, np.newaxis, 0], axis=2)
# Keeps only values for the corresponding month
# Assumes monthly resolution
# if nc.dims['time'] == 12:
# month = dd.month
# flx = flx[month - 1]
# else:
# flx = flx[0]
# xmod = xr.DataArray(trcr_flx[0],
xmod = xr.DataArray(trcr_flx,
coords={'time': times},
dims=('time', 'lat', 'lon'))
# TODO: take care if several files are read
# TODO: scale flux contribution by area weight for boxes
# TODO: consider storing fluxes at original time resolution and
# interpolate as needed
flx = np.ndarray((self.ndates, self.domain.nlat, self.domain.nlon))
# Interpolate fluxes to start time of control period
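A minimal sketch of the surrounding pattern (hypothetical sizes, and using xarray's interp, which requires scipy and is not necessarily how pycif interpolates): stacked monthly fluxes are wrapped in a DataArray and evaluated at the start of each control period.
import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range('2012-01-01', periods=12, freq='MS')  # hypothetical flux times
trcr_flx = np.random.rand(12, 10, 16)                        # (time, lat, lon)

xmod = xr.DataArray(trcr_flx,
                    coords={'time': times},
                    dims=('time', 'lat', 'lon'))

# Interpolate fluxes to the start time of each control period
ctrl_dates = pd.to_datetime(['2012-02-15', '2012-05-15', '2012-08-15'])
flx = xmod.interp(time=ctrl_dates).values                    # shape (3, 10, 16)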
......
......@@ -70,19 +70,12 @@ def read_glob(self, name, tracdir, tracfic, dates,
else:
trcr_flx = np.append(trcr_flx, np_flx, axis=0)
# trcr_flx.append(data[:, :, :])
xmod = xr.DataArray(trcr_flx,
coords={'time': times},
dims=('time', 'lat', 'lon'))
# TODO: take care if several files are read
# TODO: scale flux contribution by area weight for boxes
# TODO: consider storing fluxes at original time resolution and
# interpolate as needed
flx = np.ndarray((self.ndates, self.domain.nlat_glob, self.domain.nlon_glob))
# Interpolate fluxes to start time of control period
......
......@@ -67,9 +67,6 @@ def check_options(self, chi, finit,
check.verbose(towrite)
# ESO check dimensions
# import pdb; pdb.set_trace()
# Checking for inconsistent definition of parameters
if dim <= 0. or niter <= 0. or nsim <= 0. \
or dxmin <= 0. or epsg <= 0. or epsg > 1. \
......
......@@ -12,7 +12,6 @@ def ini_periods(self, **kwargs):
#self.subsimu_dates = \
# date_range(datei, datef, period=self.periods)
# TODO: implement sub-periods? For now just keep the full period
self.subsimu_dates = \
date_range(datei, datef, period='')
......
......@@ -21,9 +21,6 @@ def execute(self, **kwargs):
# Control vector
controlvect = self.controlvect
# ESO check dim
#import pdb; pdb.set_trace()
# Observation operator
obsoper = self.obsoperator
......
......@@ -110,7 +110,6 @@ def obsoper(self, inputs, mode,
if tracer.hresol == 'regions':
nbox = tracer.nregions
# TODO: this will change once initial conditions are added (ciniopt)
npvar = tracer.ndates*nbox
ndvar = nbox
nvar = npvar
......@@ -177,7 +176,7 @@ def obsoper(self, inputs, mode,
# If file is not found, continue
if not os.path.isfile(os.path.join(runsubdir_nest, file_name)):
verbose("Warning: file not found, ", os.path.join(runsubdir_nest, file_name))
verbose("WARNING: file not found: " + os.path.join(runsubdir_nest, file_name))
continue
grid_nest, gtime, ngrid = model.utils.read.read_flexpart_grid(
......@@ -279,12 +278,16 @@ def obsoper(self, inputs, mode,
ncini = obsvect.background.ncini
# Time step of initial concentration optimization; ni
mask = row.Index - datetime.timedelta(days=trajdays) - controlvect.background.cinitime[:] < datetime.timedelta(0)
mask = row.Index - datetime.timedelta(days=trajdays) - controlvect.background.cinitime[:] - datetime.timedelta(days=controlvect.background.cini_res) < datetime.timedelta(0)
ni = int(np.argmax(mask))
# Force last time step if last cini period (mask is all False)
if all(~mask):
ni=controlvect.background.ntcini - 1
obs_cinipos[obs_i] = np.dot(cini[:],
controlvect.x[npvar+ni*ncini:npvar+(ni+1)*ncini])
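As a worked example of the period selection above (hypothetical dates; cini_res is the length of a cini period in days): the mask marks cini periods that end after the start of the observation's back-trajectory, and np.argmax picks the first of them.
import datetime
import numpy as np

cinitime = [datetime.datetime(2012, m, 1) for m in (1, 4, 7, 10)]
obs_time = datetime.datetime(2012, 8, 15)
trajdays, cini_res = 30, 92

mask = np.array([obs_time - datetime.timedelta(days=trajdays) - t
                 - datetime.timedelta(days=cini_res) < datetime.timedelta(0)
                 for t in cinitime])
ni = int(np.argmax(mask))        # -> 2, the period starting 2012-07-01
if all(~mask):                   # all False: fall back to the last period
    ni = len(cinitime) - 1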
if getattr(tracer, 'offsets', False):
# Optimize offsets
obs_sim[obs_i] = obs_model[obs_i] + obs_ghg[obs_i] + obs_bkg[obs_i] + obs_cinipos[obs_i]
......@@ -400,11 +403,17 @@ def obsoper(self, inputs, mode,
path.init_dir(rundir)
dump_type = obsvect.dump_type
# ESO TODO: add the cini_{} columns for debugging
col2dump = ['obs_ghg', 'obs_bkg', 'obs_model', 'obs_sim', 'obs_check',
'obs_bkgerr', 'obs_err', 'obs_hx']
if hasattr(obsvect, 'background'):
col2dump.append('obs_cinipos')
col2dump += ["obs_cini_{}".format(i) for i in range(obsvect.background.ncini)]
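For example, with a background component having ncini = 2, the dumped columns above would be:
col2dump = ['obs_ghg', 'obs_bkg', 'obs_model', 'obs_sim', 'obs_check',
            'obs_bkgerr', 'obs_err', 'obs_hx',
            'obs_cinipos', 'obs_cini_0', 'obs_cini_1']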
if dump_debug:
sort_order = getattr(obsvect, 'sort_order', ['index', 'station'])
......
......@@ -6,6 +6,7 @@ import pandas as pd
import os
import numpy as np
import pycif.utils.check as check
import sys
from .headers import get_header, parse_header
from . import utils
......@@ -13,33 +14,28 @@ from . import utils
def do_parse(self,
obs_file,
maxlen=2,
maxlen=1,
default_unit='ppm',
default_tz='utc',
default_duration=1,
na_values=-999.999,
default_provider='flexinvert',
# err_na_values=0.0000,
spec=None,
# extract=['obserror', 'unit',
# 'lon', 'lat', 'alt', 'tz'],
# extract=['rec', 'yyyymmdd',
# 'hhmmss', 'conc', 'err'],
extract=['rec', 'conc', 'err'],
**kwargs):
"""Parse function for a file from WDCGG
"""Parse function for a file from flexinvertplus
Args:
obs_file (str) :
Path to input file
maxlen (int):
Maximum possible length for a WDCGG header. Default is `300`
Maximum possible length of the header. Default is `1`
default_unit (str):
Default unit to use for converting the processed species to ppm if
not explicitly specified in the WDCGG file. Default is ppm
not explicitly specified in the file. Default is ppm
default_tz (str):
Default time zone to shift observations to UTC time if not
explicitly specified in the WDCGG file. Default is utc
explicitly specified in the file. Default is utc
default_duration (str):
Default duration of measurements in hours. Default is 1.
na_values (str):
......@@ -76,6 +72,13 @@ def do_parse(self,
# Get default na_values from species description if available
if hasattr(self, 'err_na_values'):
err_na_values = self.err_na_values
# Get file containing location for all stations
if hasattr(self, 'station_detail'):
station_detail = self.station_detail
else:
raise Exception('Error: for FLEXINVERTPLUS observation file format, '
'"station_detail" must be given in the config file')
# Scans file to get the header
header = get_header(obs_file, maxlen)
......@@ -90,19 +93,13 @@ def do_parse(self,
# or from the file name
file_infos = utils.parse_file(obs_file)
# Read station info file
stn_info = utils.read_stnlist(station_detail)
if spec is None:
spec = 'n2o'
# list_extract = [spec] + extract
# Read above-ground or above-sea height
# if getattr(self, "above_sea", False):
# extract[extract.index('intake_height')] = 'altitude'
# hscale = 'altitude'
# else:
# hscale = 'intake_height'
list_extract = extract
# Get the content of columns and extra information from the header
......@@ -128,10 +125,7 @@ def do_parse(self,
# na_values=na_values
)
# import pdb; pdb.set_trace()
# Add leading zeros to time columns
#df[2][df[2] == "0"] = "000000"
df[2] = df[2].str.zfill(6)
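A small worked example of the zero-padding above, assuming column 2 holds HHMMSS strings read from the file:
import pandas as pd

s = pd.Series(['0', '93000', '120000'])
s.str.zfill(6)   # -> '000000', '093000', '120000'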
# rename to time
......@@ -147,8 +141,6 @@ def do_parse(self,
extra['tz'] = default_tz
extra['unit'] = default_unit
#extra['units']
#del(extra['units'])
# Convert to datetime format
df['time'] = pd.to_datetime(df['time'], format="%Y%m%d%H%M%S")
......@@ -199,10 +191,21 @@ def do_parse(self,
for e in extra:
df[e] = extra[e]
df['lat'] = header[0].split()[1]
df['lon'] = header[0].split()[2]
df['alt'] = header[0].split()[3]
# Get lon, lat, alt from stations info file
try:
lat, lon, alt = stn_info[file_infos['stat'].lower()]
except Exception:
print("""
Error in obsparser for FLEXINVERTPLUS:
You probably need to specify coordinates for station {} in file\n\t{}
""".format(file_infos['stat'], station_detail))
sys.exit(0)
df['lat'], df['lon'], df['alt'] = lat, lon, alt
# df['lat'] = header[0].split()[1]
# df['lon'] = header[0].split()[2]
# df['alt'] = header[0].split()[3]
file_infos['provider'] = default_provider
df['station'] = file_infos['stat']
......@@ -211,19 +214,15 @@ def do_parse(self,
df['network'] = file_infos['provider']
df['parameter'] = spec.lower()
df['duration'] = default_duration
# df.rename(columns={spec.lower(): 'obs'}, inplace=True)
df.rename(columns={'conc': 'obs'}, inplace=True)
# df.rename(columns={'value_unc': 'obserror'}, inplace=True)
df.rename(columns={'err': 'obserror'}, inplace=True)
# df.rename(columns={hscale: 'alt'}, inplace=True)
# df.rename(columns={'latitude': 'lat'}, inplace=True)
# df.rename(columns={'longitude': 'lon'}, inplace=True)
df.drop(columns='rec', inplace=True)
df['obs'] = pd.to_numeric(df['obs'])
df['lat'] = pd.to_numeric(df['lat'])
df['lon'] = pd.to_numeric(df['lon'])
df['alt'] = pd.to_numeric(df['alt'])
# If the parameter column is not a float, putting nans
if df['obs'].dtype == 'object':
......@@ -233,9 +232,6 @@ def do_parse(self,
if kwargs.get('remove_nan', True):
df = df[~np.isnan(df['obs'])]
df = df[df['obs'] > na_values]
# ESO removing the condition below, there are troubles with some VERIFY
# format files. If reported obserror is invalid just set to minerr.
#df = df[df['obserror'] > err_na_values]
# Rescales if needed
if kwargs.get('rescale', False):
......
......@@ -28,7 +28,7 @@ def get_header(obs_file, maxlen):
with open(obs_file, "r") as input_file:
lines = []
nheader = 2
nheader = 1
for line in input_file:
lines.append(line.strip())
......@@ -72,7 +72,6 @@ def parse_header(header, spec, list_extract,
"""
# Minimize all characters to facilitate comparisons
#head = [s.lower() for s in header[-1].split()[1:]]
head = [s.lower() for s in header[-1].split()]
# Parsing time information
......@@ -115,15 +114,6 @@ def parse_header(header, spec, list_extract,
# If cannot find,
# assume default values for unit and timezone
check.verbose("Cant extract " + id_extract)
# import pdb; pdb.set_trace()
# ESO look at this
# if id_extract == 'units':
# extra[id_extract] = default_unit
# elif id_extract == 'time':
# extra[id_extract] = default_tz
return names, columns, date_ids, extra
......@@ -48,6 +48,27 @@ def find_header(id_extract, header):
raise ValueError("Couldn't extract {}".format(id_extract))
def read_stnlist(station_detail):
"""Read station list file.
Returns: Dictionary with the station ID (lower case) as key and a list [lat, lon, alt] as value
TODO: this function only needs to be called once, not for every station file read
"""
with open(station_detail) as f:
stn_txt = f.readlines()
stn_dict = {}
for line in stn_txt:
stn_dict.update({line.split()[0].lower() : line.split()[1::]})
return stn_dict
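For reference, a hypothetical station_detail file and how read_stnlist parses it (whitespace-separated station id followed by lat, lon, alt, as implied by the code; values are kept as strings and converted to numbers later with pd.to_numeric):
# station_detail (hypothetical contents):
#   zep  78.91  11.89  474.0
#   mhd  53.33  -9.90   25.0
stn_info = read_stnlist('station_detail.txt')
lat, lon, alt = stn_info['zep']    # keys are lower-cased station ids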
# ESO TODO: call from init, update lat lon alt from here instead of from header
def rescale(obs_file, header):
"""Finds out on what scale the measurement was reported and returns the
corresponding scaling factor.
......@@ -90,10 +111,10 @@ def rescale(obs_file, header):
def parse_file(obs_file):
"""Parses VERIFY file name and extract corresponding information.
"""Parses FLEXINVERT file name and extract corresponding information.
"""
filesplit = os.path.basename(obs_file).split('.')
infos = {}
......
......@@ -28,19 +28,19 @@ def do_parse(self,
# extract=['value', 'value_unc',
# 'longitude', 'latitude', 'intake_height'], #, 'tz'],
**kwargs):
"""Parse function for a file from WDCGG
"""Parse function for a file in VERIFY format