import xarray as xr
import pandas as pd
import numpy as np
from dask.diagnostics import ProgressBar
import time

# Open the GRIB file holding the input variables with xarray's cfgrib engine.
# `ds` is a notebook-level global used by the cells that follow.
ds = xr.open_dataset('/nfs/a68/eebjs/hardknott/drought/vpd_variables.grib',
                     engine='cfgrib')

# I got ChatGPT to write me this!
def timer(func):
    """Decorate ``func`` so that each call prints how long it took.

    The wrapper forwards all positional and keyword arguments to ``func``,
    prints ``Function <name> took <n> seconds to execute.`` (``n`` rounded to
    whole seconds) and returns whatever ``func`` returned.  If ``func``
    raises, the exception propagates and nothing is printed.
    """
    import functools  # imported here so the decorator is self-contained

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Function {func.__name__}",
              f"took {round(elapsed)}",
              "seconds to execute.")
        return result
    return wrapper

Ignoring index file '/nfs/a68/eebjs/hardknott/drought/vpd_variables.grib.923a8.idx' older than GRIB file

from multiprocessing import cpu_count
# Show the CPU count reported by multiprocessing on this machine.
print(cpu_count())

32

# Print the dataset summary (dimensions, coordinates, data variables).
print(ds)

<xarray.Dataset>
Dimensions:     (time: 702, latitude: 1501, longitude: 3600)
Coordinates:
    number      int64 ...
  * time        (time) datetime64[ns] 1965-01-01 1965-02-01 ... 2023-06-01
    step        timedelta64[ns] ...
    surface     float64 ...
  * latitude    (latitude) float64 75.0 74.9 74.8 74.7 ... -74.8 -74.9 -75.0
  * longitude   (longitude) float64 -180.0 -179.9 -179.8 ... 179.7 179.8 179.9
    valid_time  (time) datetime64[ns] ...
Data variables:
    d2m         (time, latitude, longitude) float32 ...
    t2m         (time, latitude, longitude) float32 ...
    sp          (time, latitude, longitude) float32 ...
Attributes:
    GRIB_edition:            1
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2024-03-21T14:46 GRIB to CDM+CF via cfgrib-0.9.1...

def calculate_vpd(ds):
    """Return the vapour pressure deficit computed from ``ds``.

    Reads the ``'t2m'``, ``'d2m'`` and ``'sp'`` entries of ``ds``.  Both
    temperatures are shifted by -273.15 and the pressure is divided by 100
    before the same exponential expression is evaluated for each
    temperature.  The result is the value obtained from ``'t2m'`` (saturated
    vapour pressure) minus the value obtained from ``'d2m'`` (actual vapour
    pressure).

    NOTE(review): the conversions suggest inputs in kelvin and pascals --
    confirm against the data source.
    """
    air_temp_c, dewpoint_c = (ds[name] - 273.15 for name in ('t2m', 'd2m'))
    pressure_mb = ds['sp'] / 100

    # pressure-dependent factor shared by both vapour pressure terms
    enhancement = 1 + 7e-4 + 3.46e-6 * pressure_mb

    def vapour_pressure(temp_c):
        # same expression for the saturated (air temperature) and the
        # actual (dewpoint) vapour pressure
        return 6.112 * enhancement * np.exp((17.67 * temp_c) / (temp_c + 243.5))

    saturated = vapour_pressure(air_temp_c)
    actual = vapour_pressure(dewpoint_c)
    return saturated - actual

@timer
def calculate_vpd_without_chunking(ds):
    """Timed baseline: run ``calculate_vpd`` on ``ds`` exactly as passed in."""
    return calculate_vpd(ds)

# Baseline run on the dataset as opened; the decorator prints the elapsed time.
vpd = calculate_vpd_without_chunking(ds)

Function calculate_vpd_without_chunking took 334 seconds to execute.

@timer
def calculate_vpd_with_chunking(ds):
    """Timed run of ``calculate_vpd`` on ``ds`` rechunked along ``time``.

    ``ds`` is split into chunks of 24 along ``time``, the VPD expression is
    built on the chunked data, and the result is computed with the threaded
    scheduler (30 workers) while a progress bar is shown.
    """
    chunked = ds.chunk({'time':24})
    lazy_vpd = calculate_vpd(chunked)

    with ProgressBar():
        return lazy_vpd.compute(num_workers=30, scheduler='threads')

# Same calculation on the time-chunked dataset; this rebinds the global `vpd`.
vpd = calculate_vpd_with_chunking(ds)

[########################################] | 100% Completed | 16.68 s
Function calculate_vpd_with_chunking took 22 seconds to execute.

@timer
def calculate_spatial_mean_vpd_without_chunking(vpd):
    """Timed rolling mean of ``vpd`` over a 36-step window along ``time``.

    NOTE: despite the "spatial" in the name, no spatial reduction happens
    here; the only operation is the rolling mean along ``time``.  The name
    is kept so existing calls keep working.
    """
    # A rolling reduction always acts along the rolling dimension, so no
    # ``dim`` argument is passed (xarray treats it as having no effect).
    mean_vpd = vpd.rolling({'time':36}).mean()
    return mean_vpd

# Pass the VPD array rather than the three-variable input dataset `ds`, so this
# baseline is comparable with the chunked runs, which are also given `vpd`.
mean_vpd = calculate_spatial_mean_vpd_without_chunking(vpd)

Function calculate_spatial_mean_vpd_without_chunking took 283 seconds to execute.

@timer
def calculate_mean_vpd_chunking_across_time(vpd):
    """Timed 36-step rolling mean along ``time`` with ``vpd`` chunked in time.

    ``vpd`` is rechunked into blocks of 24 along ``time`` and the rolling
    mean is computed with the threaded scheduler (30 workers) while a
    progress bar is shown.  Returns the computed rolling mean.
    """
    # chunk across time
    vpd = vpd.chunk({'time':24})

    # Rolling reductions act along the rolling dimension by construction.
    # The first positional parameter of ``mean`` here is ``keep_attrs``, so
    # passing 'time' positionally did not select a dimension; it is omitted.
    with ProgressBar():
        mean_vpd = vpd.rolling({'time':36}).mean().compute(num_workers=30, scheduler='threads')
    return mean_vpd

# Rolling mean with the array chunked along time; elapsed time is printed.
mean_vpd = calculate_mean_vpd_chunking_across_time(vpd)

[########################################] | 100% Completed | 79.81 s
Function calculate_mean_vpd_chunking_across_time took 105 seconds to execute.

@timer
def calculate_mean_vpd_chunking_across_space(vpd):
    """Timed 36-step rolling mean along ``time`` with ``vpd`` chunked in space.

    ``vpd`` is rechunked into 300 x 600 blocks over ``latitude`` and
    ``longitude`` and the rolling mean is computed with the threaded
    scheduler (30 workers) while a progress bar is shown.  Returns the
    computed rolling mean.
    """
    # chunk across space (latitude / longitude), not across time
    vpd = vpd.chunk({'latitude':300, 'longitude':600})

    # Rolling reductions act along the rolling dimension by construction.
    # The first positional parameter of ``mean`` here is ``keep_attrs``, so
    # passing 'time' positionally did not select a dimension; it is omitted.
    with ProgressBar():
        mean_vpd = vpd.rolling({'time':36}).mean().compute(num_workers=30, scheduler='threads')
    return mean_vpd

# Rolling mean with the array chunked over latitude/longitude; elapsed time is printed.
mean_vpd = calculate_mean_vpd_chunking_across_space(vpd)

[########################################] | 100% Completed | 43.50 s
Function calculate_mean_vpd_chunking_across_space took 67 seconds to execute.

# Keep the whole first axis but only the first 200 positions on the second and
# third axes, so that the benchmark below runs faster.
da = vpd[:, :200, :200] # smaller slice of the array so that the calculation is faster

def calculate_mean(da, chunksize):
    """Time rechunking ``da`` along ``time`` and a latitude/longitude mean.

    Parameters
    ----------
    da
        Array-like object providing ``chunk`` and ``mean`` (an xarray object
        in this notebook).
    chunksize
        Chunk length used for the ``time`` dimension.

    Returns
    -------
    pandas.Series
        Two durations in seconds, labelled ``'chunking time'`` and
        ``'calculation time'``.
    """
    # perf_counter is the monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() can jump with clock adjustments.

    # time the chunking overhead
    start = time.perf_counter()
    da = da.chunk({'time':chunksize})
    chunking_time = time.perf_counter() - start

    # time the actual calculation (the result itself is discarded)
    start = time.perf_counter()
    da.mean(['latitude', 'longitude']).compute(num_workers=30, scheduler='threads')
    calculation_time = time.perf_counter() - start

    return pd.Series(
        [chunking_time, calculation_time],
        index=['chunking time', 'calculation time'],
    )

# Collect one column of timings per time-chunk size.
results = pd.DataFrame()
for chunksize in [2,4,8,16,32,64,128, 256, 512]:
    results[chunksize] = calculate_mean(da, chunksize=chunksize)

# Transpose so each chunk size is one bar; the two timings are stacked.
results.T.plot.bar(grid=True, xlabel='chunksize', ylabel='time (seconds)',
                   stacked=True)

<Axes: xlabel='chunksize', ylabel='time (seconds)'>

# Keep only the first 10 entries along the first axis so that the benchmark
# below runs faster.
da = vpd[:10] # smaller slice of the array so that the calculation is faster

def calculate_mean(da, chunksize):
    """Time rechunking ``da`` over latitude/longitude and a mean along ``time``.

    Parameters
    ----------
    da
        Array-like object providing ``chunk`` and ``mean`` (an xarray object
        in this notebook).
    chunksize
        Chunk length used for both the ``latitude`` and ``longitude``
        dimensions.

    Returns
    -------
    pandas.Series
        Two durations in seconds, labelled ``'chunking time'`` and
        ``'calculation time'``.
    """
    # perf_counter is the monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() can jump with clock adjustments.

    # time the chunking overhead
    start = time.perf_counter()
    da = da.chunk({'latitude':chunksize, 'longitude':chunksize})
    chunking_time = time.perf_counter() - start

    # time the actual calculation (the result itself is discarded)
    start = time.perf_counter()
    da.mean(['time']).compute(num_workers=30, scheduler='threads')
    calculation_time = time.perf_counter() - start

    return pd.Series(
        [chunking_time, calculation_time],
        index=['chunking time', 'calculation time'],
    )

# Collect one column of timings per spatial chunk size (the same size is used
# for latitude and longitude).
results = pd.DataFrame()
for chunksize in [20,40,80,160,320,640,1280, 2560]:
    results[chunksize] = calculate_mean(da, chunksize=chunksize)

# Transpose so each chunk size is one bar; the two timings are stacked.
results.T.plot.bar(grid=True, xlabel='chunksize', ylabel='time (seconds)',
                   stacked=True)

<Axes: xlabel='chunksize', ylabel='time (seconds)'>

# Keep the whole first axis but only the first 200 positions on the second and
# third axes, so that the benchmark below runs faster.
da = vpd[:, :200, :200] # smaller slice of the array so that the calculation is faster

def calculate_mean(da, chunksize):
    """Time rechunking ``da`` with ``chunksize`` and a latitude/longitude mean.

    Parameters
    ----------
    da
        Array-like object providing ``chunk`` and ``mean`` (an xarray object
        in this notebook).
    chunksize
        Passed straight to ``da.chunk``; the benchmark in this notebook uses
        ``'auto'`` and plain integers.

    Returns
    -------
    pandas.Series
        Two durations in seconds, labelled ``'chunking time'`` and
        ``'calculation time'``.
    """
    # perf_counter is the monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() can jump with clock adjustments.

    # time the chunking overhead
    start = time.perf_counter()
    da = da.chunk(chunksize)
    chunking_time = time.perf_counter() - start

    # time the actual calculation (the result itself is discarded)
    start = time.perf_counter()
    da.mean(['latitude', 'longitude']).compute(num_workers=30, scheduler='threads')
    calculation_time = time.perf_counter() - start

    return pd.Series(
        [chunking_time, calculation_time],
        index=['chunking time', 'calculation time'],
    )


# Collect one column of timings per chunk specification; 'auto' is passed to
# chunk() as-is, alongside the explicit integer sizes.
results = pd.DataFrame()
for chunksize in ['auto',600, 300,150,75,36,18]:
    # print the current setting so progress is visible while the loop runs
    print(chunksize)
    results[chunksize] = calculate_mean(da, chunksize=chunksize)

# Transpose so each chunk specification is one bar; the two timings are stacked.
results.T.plot.bar(grid=True, xlabel='chunksize', ylabel='time (seconds)',
                   stacked=True)

auto
600
300
150
75
36
18

<Axes: xlabel='chunksize', ylabel='time (seconds)'>

Speeding up xarray calculations using dask

Without using dask

Using dask

Which dimensions to chunk?

The effect of chunk size