Commit c3745e41 authored by Espen Sollum

Added some (incomplete) obsparsers for the VERIFY and FLEXINVERTPLUS formats

parent 16da8766
@@ -50,7 +50,8 @@ def read_conc(self, model, dir_initconc, file_initconc, **kwargs):
     if datetime.strptime(file_conc, file_initconc) in dates_available:
         data, prs, lat, lon = readnc(os.path.join(dir_initconc, file_conc),
-                                     [self.varname_conc, 'pressure', 'latitude', 'longitude'])
+                                     [self.varname_conc, self.varname_pressure,
+                                      'latitude', 'longitude'])
         # Convert to center coordinates
         lat = lat + (lat[1] - lat[0])/2.
@@ -92,6 +93,7 @@ def read_conc(self, model, dir_initconc, file_initconc, **kwargs):
             conc_all = xmod_ver
         else:
             conc_all = xr.concat([conc_all, xmod_ver], 'time')
+    # import pdb; pdb.set_trace()
     return conc_all
@@ -65,6 +65,8 @@ def background_xb(cntrlv, **kwargs):
     cntrlv.chi = np.append(cntrlv.chi, np.zeros(cinidim))
     cntrlv.chi_dim += cinidim
+    cntrlv.cinidim = cinidim
+    # Finished initializing controlvect
     cntrlv.background.updated_controlvect = True
@@ -82,7 +82,7 @@ def dump_tcorr(period, dates, sigma_t,
                evalues, evectors, dir_dump):
     """Dumps eigenvalues and vectors to a bin file. The default file format
     is:
-        '{}/tempcor_{}_{}_per{}_ct{}_lmdz.bin'.format(
+        '{}/tempcor_{}_{}_per{}_ct{}.bin'.format(
             dir_dump, datei, datef, period, sigma_t)
     Args:
@@ -96,7 +96,7 @@ def dump_tcorr(period, dates, sigma_t,
     datei = dates[0]
     datef = dates[-1]
-    file_dump = '{}/tempcor_{}_{}_per{}_ct{}_lmdz.bin'.format(
+    file_dump = '{}/tempcor_{}_{}_per{}_ct{}.bin'.format(
         dir_dump, datei.strftime("%Y%m%d%H%M"), datef.strftime("%Y%m%d%H%M"),
         period, sigma_t)
@@ -123,7 +123,7 @@ def read_tcorr(period, dates, sigma_t, dir_dump):
     datei = dates[0]
     datef = dates[-1]
-    file_dump = '{}/tempcor_{}_{}_per{}_ct{}_lmdz.bin'.format(
+    file_dump = '{}/tempcor_{}_{}_per{}_ct{}.bin'.format(
         dir_dump, datei.strftime("%Y%m%d%H%M"), datef.strftime("%Y%m%d%H%M"),
         period, sigma_t)
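With the '_lmdz' suffix dropped, dump_tcorr and read_tcorr now resolve to the
same model-agnostic file name. A runnable sketch with invented values:

from datetime import datetime

dir_dump = '/tmp/dump'                                       # invented
datei, datef = datetime(2012, 1, 1), datetime(2012, 12, 31)  # invented
period, sigma_t = 30, 0.5                                    # invented

file_dump = '{}/tempcor_{}_{}_per{}_ct{}.bin'.format(
    dir_dump, datei.strftime("%Y%m%d%H%M"), datef.strftime("%Y%m%d%H%M"),
    period, sigma_t)
print(file_dump)
# /tmp/dump/tempcor_201201010000_201212310000_per30_ct0.5.bin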
@@ -86,6 +86,7 @@ def dump(self, cntrl_file, to_netcdf=False, dir_netcdf=None, run_id=None,
     if getattr(tracer, 'offsets', False):
         offsets = True
+        x[:,0,:,:] = (x[:,0,:,:] + self.flxall)/np.float(self.model.numscale)
     else:
         print("Offsets=False")
         offsets = False
@@ -97,6 +98,8 @@ def dump(self, cntrl_file, to_netcdf=False, dir_netcdf=None, run_id=None,
     x.values[xb_grid.values > smallnum] = x.values[xb_grid.values > smallnum]*scaling_factor.values[xb_grid.values > smallnum]
     x[:,:,:,:] /= np.float(self.model.numscale)
+    # subtract initial guess for increment
+    xa[:,0,:,:] -= self.flxall/np.float(self.model.numscale)
     std = np.reshape(
         self.std[tracer.xpointer:tracer.xpointer + tracer.dim],
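A simplified, runnable sketch of the offsets bookkeeping these two hunks add
(shapes and numscale are invented; the scaling-factor step is omitted). The
gridded control variable stores offsets around the prior flxall, so the dump
adds the prior back before unscaling, while the increment subtracts it:

import numpy as np

numscale = 100.0                    # plays the role of self.model.numscale
flxall = np.full((3, 4, 5), 2.0)    # prior fluxes, invented shape
x = np.ones((3, 1, 4, 5))           # gridded control variable (offsets)
xa = x.copy()                       # analysis, same layout

# dump: add the prior to the first level, then unscale to physical units
x[:, 0, :, :] = (x[:, 0, :, :] + flxall) / numscale
# increment: subtract the unscaled prior from the analysis
xa[:, 0, :, :] -= flxall / numscale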
@@ -112,7 +112,7 @@ def init_bprod(cntrlv, options={}, **kwargs):
     errsum = np.dot(cntrlv.std, np.tile(area_reg, tracer.ndates))/float(tracer.ndates)
     toterr = errsum*3600.*24.*365./1.e9/float(tracer.numscale)
     errscalar = glob_err/toterr
-    #cntrlv.std = cntrlv.std*errscalar**2
+    cntrlv.std = cntrlv.std*errscalar**2
     verbose("Total error scaled by "+ str(errscalar))
     verbose("errsum "+ str(errsum))
     # TODO: rework this based on most recent flexinvert version,
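A runnable sketch of the re-enabled scaling with invented numbers (the unit
factors are exactly those of the hunk; the TODO above suggests the formula is
still provisional):

import numpy as np

glob_err, numscale = 10.0, 1.0      # invented target error and scale
std = np.full(4, 0.2)               # invented prior error vector
errsum = 5.0e3                      # stands for np.dot(std, area)/ndates
toterr = errsum * 3600. * 24. * 365. / 1.e9 / numscale
errscalar = glob_err / toterr
std = std * errscalar**2            # note the square, as in the line above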
@@ -241,6 +241,7 @@ def init_bprod(cntrlv, options={}, **kwargs):
     tracer.chi_pointer = cntrlv.chi_dim
     tracer.chi_dim = tracer.chi_hresoldim * tracer.chi_tresoldim
     cntrlv.chi_dim += tracer.chi_dim
+    #import pdb; pdb.set_trace()
     # Defining chi from the total dimension
     cntrlv.chi = np.zeros((cntrlv.chi_dim,))
@@ -152,4 +152,6 @@ def init_xb(cntrlv, **kwargs):
     cntrlv.xb = np.append(cntrlv.xb, xb)
     cntrlv.std = np.append(cntrlv.std, std)
+    #import pdb; pdb.set_trace()
     return cntrlv
@@ -6,6 +6,9 @@ def sqrtbprod(controlvect, chi, **kwargs):
     """
+    # ESO check dim
+    #import pdb; pdb.set_trace()
     # Initializes output vector
     xout = np.zeros(controlvect.dim)
@@ -70,7 +73,9 @@ def sqrtbprod(controlvect, chi, **kwargs):
     # Fill data for background scalars (if present)
     if hasattr(controlvect, 'background'):
-        xout[-controlvect.background.ncini::] = chi[-controlvect.background.ncini::]
+        # xout[-controlvect.background.ncini::] = chi[-controlvect.background.ncini::]
+        # ESO: is this correct for multiple cini periods?
+        xout[-controlvect.cinidim::] = chi[-controlvect.cinidim::]
     return xout * controlvect.std + controlvect.xb
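The ESO comment asks whether the tail slice handles multiple initial-mixing-ratio
periods. A runnable sketch of why the slice must use the full cinidim, assuming
(as background_xb above suggests) that cinidim aggregates the ncini scalars over
all periods:

import numpy as np

ncini, nper = 2, 3                  # invented: scalars per period, periods
cinidim = ncini * nper              # as set in background_xb above
chi = np.arange(10. + cinidim)
xout = np.zeros_like(chi)
xout[-cinidim:] = chi[-cinidim:]    # copies the scalars for all periods
# xout[-ncini:] = chi[-ncini:]      # would only copy the last period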
@@ -39,7 +39,8 @@ def read(self, name, tracdir, tracfic, dates,
     list_fic_flx = np.unique([dd.strftime(tracfic) for dd in dates])
     # Reading fluxes for periods within the simulation window
     trcr_flx = []
+    # trcr_flx = np.zeros(0, self.domain.nlat, self.domain.nlon)
+    first = True
     times = []
     # for dd, fic_flx in zip(dates, list_fic_flx):
     for file_flx in list(list_fic_flx):
@@ -61,8 +62,16 @@ def read(self, name, tracdir, tracfic, dates,
     flx_reg = data[:, iy0:iy0+self.domain.nlat, ix0:ix0+self.domain.nlon]
+    # converting to regular np array and appending
+    np_flx = np.array(flx_reg)
-    trcr_flx.append(flx_reg)
+    if first:
+        trcr_flx = np_flx
+        first = False
+    else:
+        trcr_flx = np.append(trcr_flx, np_flx, axis=0)
+    # trcr_flx.append(flx_reg)
     # nc = xr.open_dataset(
@@ -97,7 +106,8 @@ def read(self, name, tracdir, tracfic, dates,
     # flx = flx[0]
-    xmod = xr.DataArray(trcr_flx[0],
+    # xmod = xr.DataArray(trcr_flx[0],
+    xmod = xr.DataArray(trcr_flx,
                         coords={'time': times},
                         dims=('time', 'lat', 'lon'))
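The first/np.append pattern above (repeated in read_glob below) concatenates the
per-file flux arrays along the time axis, so the final xr.DataArray gets a full
time dimension instead of only the first file's slice. A runnable sketch with
invented shapes:

import numpy as np

# Three files, each with 2 time steps on a 4x5 grid (invented)
chunks = [np.random.rand(2, 4, 5) for _ in range(3)]
trcr_flx, first = None, True
for chunk in chunks:
    np_flx = np.array(chunk)
    if first:
        trcr_flx, first = np_flx, False
    else:
        trcr_flx = np.append(trcr_flx, np_flx, axis=0)  # stack along time
print(trcr_flx.shape)  # (6, 4, 5): matches dims=('time', 'lat', 'lon')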
@@ -46,7 +46,8 @@ def read_glob(self, name, tracdir, tracfic, dates,
     trcr_flx = []
     times = []
     # for dd, fic_flx in zip(dates, list_fic_flx):
-    trcr_flx = []
+    # trcr_flx = []
+    first = True
     for file_flx in list(list_fic_flx):
         data, lat, lon, time_jd = readnc(os.path.join(tracdir, file_flx),
                                          [self.varname_flx, self.latname_flx,
@@ -59,10 +60,19 @@ def read_glob(self, name, tracdir, tracfic, dates,
     # Convert to ng/m2/s
     numscale = np.float(getattr(self, 'numscale', 1.E12))
     data *= numscale/3600.
+    # Converting to regular np array and appending
+    np_flx = np.array(data)
+    if first:
+        trcr_flx = np_flx
+        first = False
+    else:
+        trcr_flx = np.append(trcr_flx, np_flx, axis=0)
-    trcr_flx.append(data[:, :, :])
+    # trcr_flx.append(data[:, :, :])
-    xmod = xr.DataArray(trcr_flx[0],
+    xmod = xr.DataArray(trcr_flx,
                         coords={'time': times},
                         dims=('time', 'lat', 'lon'))
@@ -66,6 +66,9 @@ def check_options(self, chi, finit,
     """.format(dim, m, dxmin, df1, epsg, niter, nsim)
     check.verbose(towrite)
+    # ESO check dimensions
+    # import pdb; pdb.set_trace()
     # Checking for inconsistent definition of parameters
     if dim <= 0. or niter <= 0. or nsim <= 0. \
@@ -43,7 +43,8 @@ subroutine read_header(filename, numpoint, ibdate, ibtime, releases, &
-    character(len=256) :: flexversion
+    ! character(len=256) :: flexversion
+    character(len=29) :: flexversion
     logical :: lexist
     integer :: nageclass, imethod, nspec
     integer :: loutstep, loutaver, loutsample
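! NB: with Fortran sequential unformatted I/O, the declared length of
! flexversion must match what was written into the header record
! (presumably 29 characters for this FLEXPART version); a longer
! declaration would consume bytes belonging to the following fields.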
@@ -21,6 +21,9 @@ def execute(self, **kwargs):
     # Control vector
     controlvect = self.controlvect
+    # ESO check dim
+    #import pdb; pdb.set_trace()
     # Observation operator
     obsoper = self.obsoperator
@@ -177,6 +177,7 @@ def obsoper(self, inputs, mode,
     # If file is not found, continue
     if not os.path.isfile(os.path.join(runsubdir_nest, file_name)):
         verbose("Warning: file not found, ", os.path.join(runsubdir_nest, file_name))
+        continue
     grid_nest, gtime, ngrid = model.utils.read.read_flexpart_grid(
@@ -189,6 +190,7 @@ def obsoper(self, inputs, mode,
     file_name = 'grid_time_' + file_date + '_001'
     if not os.path.isfile(os.path.join(runsubdir_glob, file_name)):
         verbose("Warning: file not found, ", os.path.join(runsubdir_glob, file_name))
+        continue
     grid_glob, gtime_glob, ngrid_glob = model.utils.read.read_flexpart_grid(
# -*- coding: utf-8 -*-
from builtins import map
from builtins import zip
from builtins import range
import pandas as pd
import os
import numpy as np
import pycif.utils.check as check
from .headers import get_header, parse_header
from . import utils
def do_parse(self,
obs_file,
maxlen=2,
default_unit='ppm',
default_tz='utc',
default_duration=1,
na_values=-999.999,
default_provider='flexinvert',
# err_na_values=0.0000,
spec=None,
# extract=['obserror', 'unit',
# 'lon', 'lat', 'alt', 'tz'],
# extract=['rec', 'yyyymmdd',
# 'hhmmss', 'conc', 'err'],
extract=['rec', 'conc', 'err'],
**kwargs):
"""Parse function for a file from WDCGG
Args:
obs_file (str) :
Path to input file
maxlen (int):
Maximum possible length for a WDCGG header. Default is `300`
default_unit (str):
Default unit to use for converting the processed species to ppm if
not explicitly specified in the WDCGG file. Default is ppm
default_tz (str):
Default time zone to shift observations to UTC time if not
explicitly specified in the WDCGG file. Default is utc
default_duration (str):
Default duration of measurements in hours. Default is 1.
na_values (str):
Pattern for NaN values. Default is -999
spec (str, optional) :
Species to extract. Default is None.
If `MCF` is specified, it is translated to `CH3CCl3`.
extract (list[str], optional):
List of parameters to extract.
Default is `[error, lon, lat, alt, tz]`
logfile (str, optional) :
File name for verbose entries
Returns:
pandas.DataFrame :
Dataframe from input file df[parameter][station]
"""
check.verbose("Reading observation file: {}".format(os.path.basename(obs_file)))
# Get default unit from species description if available
if hasattr(self, 'default_unit'):
default_unit = self.default_unit
# Get default duration from species description if available
if hasattr(self, 'default_duration'):
default_duration = self.default_duration
# Get default na_values from species description if available
if hasattr(self, 'na_values'):
na_values = self.na_values
    # Get default err_na_values from species description if available
if hasattr(self, 'err_na_values'):
err_na_values = self.err_na_values
# Scans file to get the header
header = get_header(obs_file, maxlen)
# Does not read empty files
if len(header) == 0:
check.verbose("{} is empty. Not reading it".format(obs_file))
return pd.DataFrame({})
else:
        # Get station and provider info from the file name;
        # fall back to a default species if none was given
file_infos = utils.parse_file(obs_file)
if spec is None:
spec = 'n2o'
# list_extract = [spec] + extract
# Read above-ground or above-sea height
# if getattr(self, "above_sea", False):
# extract[extract.index('intake_height')] = 'altitude'
# hscale = 'altitude'
# else:
# hscale = 'intake_height'
list_extract = extract
# Get the content of columns and extra information from the header
names, columns, date_ids, extra = \
parse_header(header,
spec,
list_extract,
default_unit,
default_tz)
# Reads the file with Pandas
df = pd.read_csv(obs_file,
skiprows=len(header),
usecols=date_ids + columns,
# parse_dates=[list(range(len(date_ids)))],
parse_dates=[date_ids],
infer_datetime_format=True,
quoting=3,
header=None,
keep_date_col=True,
# sep=" "#,
delim_whitespace=True
# na_values=na_values
)
# import pdb; pdb.set_trace()
# Add leading zeros to time columns
#df[2][df[2] == "0"] = "000000"
df[2] = df[2].str.zfill(6)
        # Build a combined date+time string from the date and time columns
df['time'] = df[1] + df[2]
# Rename columns according to standard names
df.rename(columns=dict(list(zip(columns, names))), inplace=True)
#df.rename(columns={'_'.join(map(str, date_ids)): 'time'}, inplace=True)
df.drop(columns='_'.join(map(str, date_ids)), inplace=True)
df.drop(columns=date_ids, inplace=True)
# extra['tz'] = extra['time']
# del(extra['time'])
extra['tz'] = default_tz
extra['unit'] = default_unit
#extra['units']
#del(extra['units'])
# Convert to datetime format
df['time'] = pd.to_datetime(df['time'], format="%Y%m%d%H%M%S")
# Set the data frame index as time
df.set_index('time', inplace=True)
        # Deal with problems in reading the dates:
        # some files report hour values of 99 or 24
if not isinstance(df.index, pd.DatetimeIndex):
index = df.index.values.astype('str')
# Removes hours > 24 and minutes > 60
hours = \
np.array(
[ln.split(':')[0].split(' ')[-1] for ln in index]).astype(int)
df = df[hours <= 24]
index = index[hours <= 24]
minutes = \
np.array(
[ln.split(':')[1] for ln in index]).astype(int)
df = df[minutes <= 60]
index = index[minutes <= 60]
            # Shift hours at 24 back to 23 so the dates can be parsed,
            # then add the hour back afterwards (same for minutes at 60)
mask24 = np.core.defchararray.find(index, '24:') > 0
index = np.core.defchararray.replace(index, '24:', '23:')
mask60 = np.core.defchararray.find(index, ':60') > 0
index = np.core.defchararray.replace(index, ':60', ':00')
index = pd.to_datetime(pd.Series(index))
index[mask24] += pd.DateOffset(hours=1)
index[mask60] += pd.DateOffset(hours=1)
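            # e.g. '2012-01-01 24:00:00' parses as '2012-01-01 23:00:00'
            # after the replacement, and the offset restores
            # '2012-01-02 00:00:00'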
df.index = index
# Shifting dates depending on time zone, then removing corresponding key
df.index = utils.shiftdate(df.index, extra['tz'])
del extra['tz']
# Fill extra columns with the same value everywhere
# e.g., for coordinates when only specified in the header
for e in extra:
df[e] = extra[e]
df['lat'] = header[0].split()[1]
df['lon'] = header[0].split()[2]
df['alt'] = header[0].split()[3]
file_infos['provider'] = default_provider
        df['station'] = file_infos['stat']
df['network'] = file_infos['provider']
df['parameter'] = spec.lower()
df['duration'] = default_duration
# df.rename(columns={spec.lower(): 'obs'}, inplace=True)
df.rename(columns={'conc': 'obs'}, inplace=True)
# df.rename(columns={'value_unc': 'obserror'}, inplace=True)
df.rename(columns={'err': 'obserror'}, inplace=True)
# df.rename(columns={hscale: 'alt'}, inplace=True)
# df.rename(columns={'latitude': 'lat'}, inplace=True)
# df.rename(columns={'longitude': 'lon'}, inplace=True)
df.drop(columns='rec', inplace=True)
df['obs'] = pd.to_numeric(df['obs'])
df['lat'] = pd.to_numeric(df['lat'])
df['lon'] = pd.to_numeric(df['lon'])
        # If the obs column could not be parsed as numeric, set it to NaN
if df['obs'].dtype == 'object':
df['obs'] = np.nan
# Removing rows with NaNs
if kwargs.get('remove_nan', True):
df = df[~np.isnan(df['obs'])]
df = df[df['obs'] > na_values]
            # ESO: removing the condition below; it causes problems with some
            # VERIFY format files. If the reported obserror is invalid, it is
            # set to the minimum error (measerr) further down.
            #df = df[df['obserror'] > err_na_values]
# Rescales if needed
if kwargs.get('rescale', False):
# coeffscale = rescale(obs_file, header)
            coeffscale = utils.rescale(obs_file, header)
            if np.isnan(coeffscale):
                check.verbose("Unknown scale, please check with provider")
            else:
                # Only rescale when the scale factor is known (avoids
                # multiplying the observations by NaN)
                df['obs'] *= coeffscale
                df['obserror'] *= coeffscale
# Converts unit
df = utils.convert_unit(df, ['obs', 'obserror'],
default_unit=default_unit)
        # Set a minimum measurement error to override reported values,
        # and use it where the reported obserror is missing
        if hasattr(self, 'measerr'):
            # df['obserror'][df['obserror'] < self.measerr] = self.measerr
            df.loc[df['obserror'] < self.measerr, 'obserror'] = self.measerr
            df['obserror'] = df['obserror'].fillna(self.measerr)
return df
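A hedged usage sketch (the parser instance and the file path are assumptions,
not part of this commit):

# df = do_parse(parser, '/path/to/obs_bik.txt', spec='n2o')
# -> DataFrame indexed by UTC time with columns including 'obs', 'obserror',
#    'lat', 'lon', 'alt', 'station', 'network', 'parameter' and 'duration'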
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import absolute_import
from .utils import remap_extract, find_header
import os
import glob
import pycif.utils.check as check
def get_header(obs_file, maxlen):
"""Extract the header from a FLEXINVERTPLUS format observation file
Args:
obs_file (str): path to input file
maxlen (int): abort after this amount of lines when reading header.
Default 2
Returns:
List[str]: List with all Lines of the Header
"""
# return empty if not a file
if not os.path.isfile(obs_file):
return []
with open(obs_file, "r") as input_file:
lines = []
nheader = 2
for line in input_file:
lines.append(line.strip())
if len(lines) > maxlen:
break
if len(lines) > nheader:
break
if not lines:
return []
    # Return only the fixed-size header (the first nheader lines)
return lines[:nheader]
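# For example (path illustrative), get_header('/path/to/obs_bik.txt', maxlen=2)
# returns the first two stripped lines of the file, or [] if the file is
# missing or empty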
def parse_header(header, spec, list_extract,
default_unit='ppm',
default_tz='utc'):
"""Extract information from the header
Args:
header (list[str]): extracted header
spec (str): species to extract
        list_extract (list[str]): list of parameters to return:
            'flag' to extract the flag,
            'error' to extract the observation error,
            or any other parameter appearing in the columns
default_unit (str): default unit generally used to report this species
default_tz (str): default time zone for this file
Returns:
a 4-element tuple containing
- names (list[str]): list of columns names to extract
- columns (list[int]): list of column index to extract
- date_ids (list[int]): list of column ids for date information
- extra (dict): extra information contained in the header and not
in the body of the file, e.g., altitude, coordinates, unit, etc.
"""
    # Lower-case all column names to simplify comparisons
    #head = [s.lower() for s in header[-1].split()[1:]]
    head = [s.lower() for s in header[-1].split()]
# Parsing time information
try:
date_ids = [head.index('yyyymmdd'), head.index('hhmmss')]
    except ValueError:
        print(header)
        print(head)
        raise ValueError("Can't find a date in this VERIFY file. "
                         "Please check format")
# Getting other parameters using the utils.remap_extract function
columns = []
names = []
extra = {}