Compare revisions: aqdl/denoising

Commits on Source (2)
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import sys
from scipy.signal import savgol_filter
import copy
import plotly.graph_objects as go
import json
from scipy.interpolate import PchipInterpolator
from denoising import (dataset, outliers)
......@@ -8,6 +15,7 @@ from denoising import (dataset, outliers)
def main():
# Settings
srcdir = Path('/home/wrfadmin/PycharmProjects/hapads/denoising/data')
dstdir = Path('/home/wrfadmin/test')
sensor_data_1_txt = srcdir.joinpath('sensor_data.txt')
sensor_data_2_txt = srcdir.joinpath('sensor_data_dataset_2.txt')
reference_data_1_txt = srcdir.joinpath('weather_station_data.txt')
......@@ -30,37 +38,157 @@ def main():
# detection, the suggested value of L is close to 0.
L = -0.1
wname = 'haar'
plot = 0
comment = 1
# Load datasets
reference_data = dataset.from_csv(paths=[reference_data_1_txt, reference_data_2_txt])
# Apply offset to reference data
reference_data = dataset.apply_offset(reference_data, refence_offset)
sensor_data = dataset.from_csv(paths=[sensor_data_1_txt, sensor_data_2_txt])
scor = sensor_data[['pm10', 'pm1', 'pm2_5']].values
# # OK
# dataset.check_values(scor)
scor2 = copy.deepcopy(scor)
# Clean sensor data using threshold
scor2 = outliers.apply_threshold(sensor_data[['pm10', 'pm1', 'pm2_5']].values, threshold=thd)
scor2 = outliers.apply_threshold(scor2, threshold=thd)
####################################################################################################################
####################################################################################################################
####################################################################################################################
# Set minimum target PM value (per-sample minimum across the PM channels)
reftst = scor2.min(axis=1)
scor3 = outliers.apply_delta_relative_threshold(dataset=scor2[:, -1], reftst=reftst, threshold=thd2)
# TODO: Not exactly how it is done in the notebook. Let's check that this is the same as scor3 = deepcopy(scor2[:, -1])!
scor3 = copy.deepcopy(scor2)
scor3 = outliers.apply_delta_relative_threshold(dataset=scor3[:, -1], reftst=reftst, threshold=thd2)
####################################################################################################################
####################################################################################################################
####################################################################################################################
signal = np.expand_dims(scor3, axis=0)
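# The despiking routines expect a 2-D array of shape (n_signals, n_samples) (they index signal.shape[1]), hence the added leading axis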
test = outliers.apply_wavelets_despiking(signal, sfr, wid, L, wname)
# OK !!!
tE = outliers.get_wavelet_spikes(signal, sfr, wid, L, wname)
scor4 = copy.deepcopy(scor3)
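# Replace each detected spike sample with the mean of its two immediate neighbours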
for k in range(len(tE)):
scor4[tE[k]] = (np.array([scor4[tE[k] - 1], scor4[tE[k] + 1]])).mean()
# dataset.check_values(scor4)
# Interpolate reference at sensor time
reference_interpolator = PchipInterpolator(x=reference_data.values[:, 0], y=reference_data.values[:, 5])
refint = reference_interpolator(sensor_data.values[:, 0])
# dataset.check_values(refint)
# NOTE: Call this scor5 to avoid overwriting scor4, for control purposes
scor5 = savgol_filter(x=scor4,
window_length=5, # window size used for filtering
polyorder=2) # order of fitted polynomial
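# savgol_filter fits a local quadratic (polyorder=2) over each 5-sample window, smoothing noise while preserving peaks better than a plain moving average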
# dataset.check_values(scor5)
corr_matrix = np.corrcoef(refint, scor5)
corr = corr_matrix[0, 1]
R_sq = corr ** 2
# How well does refint match the observations scor5
print(f'Coefficient of determination: {R_sq}')
rmse5 = dataset.rmse(refint, scor5)
rmse3 = dataset.rmse(refint, scor3)
rmse2 = dataset.rmse(refint, scor2[:, 2])
rmse1 = dataset.rmse(refint, sensor_data.values[:, 5])
print(f'RMSE - scor5: {rmse5}')
print(f'RMSE - scor3: {rmse3}')
print(f'RMSE - scor2: {rmse2}')
print(f'RMSE - scor1: {rmse1}')
# Display results
plt.figure(figsize=(30, 8))
plt.plot(sensor_data.values[:, 0][0:1140], reftst[0:1140], 'blue', linewidth=2)
plt.plot(sensor_data.values[:, 0][0:1140], scor3[0:1140], 'r', linewidth=0.5)
plt.plot(sensor_data.values[tE, 0][0:48], scor3[tE][0:48], 'ro', mfc='none')
plt.plot(sensor_data.values[:, 0][0:1140], scor4[0:1140], 'c')
plt.xlabel('Time')
plt.ylabel('Concentration')
plt.savefig(dstdir.joinpath('figure_1.png'))
plt.show()
# f1 = go.Figure(
# data=[
# go.Scatter(x=sensor_data.values[:, 0][0:1140], y=reftst[0:1140], name="reftst"),
# go.Scatter(x=sensor_data.values[:, 0][0:1140], y=scor3[0:1140], name="scor3"),
# go.Scatter(x=sensor_data.values[tE, 0][0:48], y=scor3[tE][0:48], name="scor3-peaks", mode='markers', marker=dict(color='red')),
# go.Scatter(x=sensor_data.values[:, 0][0:1140], y=scor4[0:1140], name="scor4")
# ],
# layout={"xaxis": {"title": "Time"}, "yaxis": {"title": "Concentration"}, "title": ""}
# )
# f1.show()
plt.figure(figsize=(30, 8))
plt.plot(sensor_data.values[:, 0][0:1140], scor2[:, 0][0:1140], 'k', linewidth=2)
plt.plot(sensor_data.values[:, 0][0:1140], scor2[:, 1][0:1140], 'cyan', linewidth=1)
plt.plot(sensor_data.values[:, 0][0:1140], scor3[0:1140], 'b', linewidth=0.5)
plt.plot(sensor_data.values[:, 0][0:1140], scor2[:, 2][0:1140], 'r')
plt.xlabel('Time')
plt.ylabel('Concentration')
plt.savefig(dstdir.joinpath('figure_2.png'))
plt.show()
plt.figure(figsize=(30, 8))
plt.plot(sensor_data.values[:, 0][0:1140], scor3[0:1140], 'b', linewidth=2)
plt.plot(sensor_data.values[:, 0][0:1140], scor5[0:1140], 'g', linewidth=2)
plt.plot(sensor_data.values[:, 0][0:1140], refint[0:1140], 'r')
plt.xlabel('Time')
plt.ylabel('Concentration')
plt.savefig(dstdir.joinpath('figure_3.png'))
plt.show()
plt.figure(figsize=(15, 15))
plt.plot(refint, scor5, 'o')
plt.xlim([0, refint.max()])
plt.ylim([0, scor5.max()])
plt.xlabel('Interpolated')
plt.ylabel('Despiked')
plt.title(f'Coefficient of determination R²: {R_sq}')
plt.savefig(dstdir.joinpath('figure_4.png'))
plt.show()
# Save corrected results
tstamp = copy.deepcopy(sensor_data.values[:, 0])
xs = [sensor_data.values[:, 8], sensor_data.values[:, 1], sensor_data.values[:, 2]/100.0, scor5]
xr = np.zeros((len(tstamp), 4))
xr[:, 0] = PchipInterpolator(reference_data.values[:, 0], reference_data.values[:, 7])(tstamp)
xr[:, 1] = PchipInterpolator(reference_data.values[:, 0], reference_data.values[:, 1])(tstamp)
xr[:, 2] = PchipInterpolator(reference_data.values[:, 0], reference_data.values[:, 2])(tstamp)
xr[:, 3] = refint
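# With the default from_csv column names, xr holds reference temp0, rh and pa interpolated to the sensor timestamps plus the interpolated pm2_5 (refint); xs holds sensor temp1, rh, pa/100 and the corrected series scor5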
scor1 = sensor_data.values[:, 3:5]
res = dict(xs=xs, xr=xr, scor1=scor1, scor2=scor2, ref=refint, env=reftst, rmse1=rmse1, rmse2=rmse2, rmse3=rmse3, rmse4=rmse5)
# with open(dstdir.joinpath('results.json'), 'w') as outfile:
# outfile.write(json.dumps(res, indent=4))
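# NOTE: json.dumps cannot serialise numpy arrays directly; the entries of res would need .tolist() (or a custom encoder) before the dump above can be enabled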
pass
......
import math
import numpy as np
from pathlib import Path
import pandas as pd
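# Print basic per-column statistics (min, mean, max, std, sum) as a quick sanity check of a loaded array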
def check_values(dataset):
print(f'{dataset.min(axis=0)=}')
print(f'{dataset.mean(axis=0)=}')
print(f'{dataset.max(axis=0)=}')
print(f'{dataset.std(axis=0, ddof=1)=}')
print(f'{dataset.sum(axis=0)=}')
def from_csv(paths: list[Path], names=['timestamp', 'rh', 'pa', 'pm10', 'pm1', 'pm2_5', 'no2', 'temp0', 'temp1']) -> pd.DataFrame:
dataset_list = [pd.read_csv(path, skiprows=3, header=None, sep=' ', names=names) for path in paths]
# Merge datasets
dataset = pd.concat(dataset_list)
for col in dataset.columns.tolist():
dataset[col] = dataset[col].astype(float)
# Reset the index, otherwise the concatenated frames keep duplicated index labels
return dataset.reset_index(drop=True)
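# Minimal usage sketch (the relative paths below are illustrative, not part of the module):
#     data = from_csv(paths=[Path('data/sensor_data.txt'), Path('data/sensor_data_dataset_2.txt')])
#     check_values(data[['pm10', 'pm1', 'pm2_5']].values)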
......@@ -12,7 +24,19 @@ def apply_offset(dataset, offset):
dataset.timestamp += offset
return dataset
def rmse(y_actual, y_predicted):
MSE = np.square(np.subtract(y_actual, y_predicted)).mean()
return math.sqrt(MSE)
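# Equivalent formulation, as a note: np.sqrt(np.mean((y_actual - y_predicted) ** 2))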
def evaluate_results(refint, scor5, scor4, scor3, scor2, sensor_data, reftst, tE, save_to=None):
corr_matrix = np.corrcoef(refint, scor5)
corr = corr_matrix[0, 1]
R_sq = corr ** 2
# How well does refint match the observations scor5
print(f'Coefficient of determination: {R_sq}')
def sensor_data():
pass
\ No newline at end of file
......@@ -2,7 +2,8 @@ import math
import numpy as np
import pywt
from denoising import wavelets
from typing import (List, Literal)
from typing import Literal
......@@ -33,7 +34,7 @@ def apply_threshold(dataset: np.array, threshold):
else:
loc1 = max(lt)
loc2 = min(gt)
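# loc1/loc2 pick the nearest samples below and above the flagged index (lt/gt come from earlier in this function); the outlier is replaced by their mean below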
loc = [loc1, loc2]
loc = np.array([loc1, loc2])
dataset[idtmp, jj] = dataset[loc, jj].mean()
return dataset
......@@ -47,9 +48,9 @@ def apply_delta_relative_threshold(dataset, reftst, threshold):
return dataset
def apply_wavelets_despiking(signal: np.array, sfr, wid: np.array, L: float,
wname: Literal['bior1.5','bior1.3','sym2','db2','haar'],
option:Literal['drop', 'reset']='reset'):
def get_wavelet_spikes(signal: np.array, sfr, wid: np.array, L: float,
wname: Literal['bior1.5','bior1.3','sym2','db2','haar'],
option:Literal['drop', 'reset']='reset'):
# Number of time samples
nt = signal.shape[1]
# Make sure signal is zero-mean
......@@ -73,46 +74,25 @@ def apply_wavelets_despiking(signal: np.array, sfr, wid: np.array, L: float,
# [ms] The refractory period -- cannot resolve spikes that are closer than refract
refract = 1.5 * wid[1]
# print(f'{refract=}')
refract = round(refract * sfr)
# print(f'{refract=}')
merge = wid.mean()
merge = round(merge * sfr)
# print(f'{merge=}')
# discard spikes located at the first and last samples
is_spike[0, [0, -1]] = False
ind_ones = np.argmax(is_spike[0, :] == True)
ind_ones_is_empty = not ind_ones.any()
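# Index 0 was just cleared, so argmax returns 0 only when no spike sample remains; this is effectively a check for 'any spikes at all'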
if not ind_ones_is_empty:
# there will be 1 followed by -1 for each spike
temp = np.diff(is_spike)
# nominal number of spikes
nsp = np.count_nonzero(temp == 1)
# index of the beginning of a spike
lead_t = np.where(temp == 1)[1]
# index of the end of the spike
lag_t = np.where(temp == -1)[1]
# print(f'{temp=}, {nsp=}, {lead_t=}, {lag_t=}')
tE = np.zeros(nsp, dtype=int)
for i in range(nsp):
tE[i] = np.ceil(np.array([lead_t[i], lag_t[i]]).mean())
# print(f'{tE[i]=}') # ok
# print(tE.mean())
# print(tE.min())
# print(tE.max())
# print(tE.std(ddof=1))
# print(len(tE))
for i in range(len(tE) - 1):
diff = tE[i + 1] - tE[i]
# if (diff < refract) and (diff > merge):
......@@ -123,10 +103,11 @@ def apply_wavelets_despiking(signal: np.array, sfr, wid: np.array, L: float,
elif diff <= merge:
# merge
tE[i] = np.ceil(np.array([tE[i], tE[i + 1]]).mean())
print(tE[i])
# discard
# TODO: How to implement????
tE[i + 1] = []
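# NOTE: assigning [] to a single numpy element raises a ValueError; the MATLAB-style
# in-place deletion has no direct equivalent here. One possible sketch (not the
# committed implementation) is to flag merged spikes in a boolean mask and drop
# them after the loop:
#     keep = np.ones(len(tE), dtype=bool)   # before the loop
#     keep[i + 1] = False                   # here, instead of tE[i + 1] = []
#     tE = tE[keep]                         # after the loop, before returning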
return tE
def get_spike_indexes(L, c, nt, nw, option, w):
......@@ -171,21 +152,14 @@ def get_spike_indexes(L, c, nt, nw, option, w):
ct[i, ind] = c[i, ind]
else:
mj = np.mean(np.abs(c[i, index])) # mean of the signal coefficients
# print(f'{mj=}')
ps = len(index) / nt # prior of spikes
# print(f'{len(index)=}')
# print(f'{ps=}')
pn = 1 - ps # prior of noise
# print(f'{pn=}')
# hypothesis testing
# eqn (7)
loge = L + math.log(pn / ps)
# eqn (6)
dth = mj / 2 + sigmaj ** 2 / mj * loge # ecision threshold
# print(f'{dth=}')
dth = mj / 2 + sigmaj ** 2 / mj * loge # decision threshold
# TODO: Check that the following line makes sense in python
dth = np.abs(dth) * (dth >= 0) # make DTh>=0
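# Equivalent to max(dth, 0): a negative decision threshold is clipped to zero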
# print(f'{dth=}')
ind = np.where(np.abs(c[i, :]) > dth)[0]
ct[i, ind] = c[i, ind]
# print(f'{dth=}, {ind=}, {ct[i,ind]=}')
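# In summary: per scale i, dth = mj/2 + (sigmaj**2 / mj) * (L + ln(pn/ps)) (eqns (6)-(7) above), and coefficients with |c[i, k]| > dth are retained as spike coefficients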
......@@ -198,7 +172,8 @@ def get_spike_indexes(L, c, nt, nw, option, w):
# make a union with coefficients from previous scales
Index = np.logical_or(io, Index).astype(int)
io = Index
return io
return Index
......