ts-python

Seminar III: Tabular Data Persistence in Python (2022-07-22)

“I used to be with ‘it,’ but then they changed what ‘it’ was. Now what I’m with isn’t ‘it’ anymore, and what’s ‘it’ seems weird and scary to me.”

— ‘Homerpalooza’ (S07E24)

Abstract

How do you persist tabular data; how do you store it to disk? Of all the common formats—.csv, .npy, Parquet, Feather, HDF, etc.—how do you know which to choose and what trade-offs they each present?

Do you…

Then come join us for a session on columnar file and database formats!

How do you store tabular data between analyses? How do you store important metadata along with the tabular data? What do you do about parts of your data that may require runtime support (e.g., complex indices or the use of user-defined types?) Why and when is pickle not enough—or even a bad idea?

In this episode, we’ll discuss persisting tabular data for storage to disk or transmission to other users (e.g., to-and-from API endpoints or to-and-from nodes in a distributed computation.) We’ll discuss the difference between text-based formats (e.g., csv, txt, fwf, tsv, dat, json,) binary formats (e.g., pickle, npy, parquet, feather,) and database formats (e.g., hdf, sql.) We’ll cover which of these is most convenient and performant for persisting data, as well as the contexts in which each format should be used or avoided.

Keywords: parquet, csv, persistence, feather, tabular, data

Notes

Question: What’s wrong with CSV?

print("Let's take a look!")
from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile

rng = default_rng(0)

tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
    100 * rng.random(size=len(tickers))
  * rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)

industries = Series(
    data=rng.choice(
        ['energy', 'healthcare', 'tech', 'finance'],
        size=len(tickers),
    ),
    index=Index(tickers, name='ticker'),
    name='industry',
).astype('category')

df = DataFrame({
    'date':     tile(dates, len(tickers)),
    'ticker':   repeat(tickers, len(dates)),
    'price':    prices,
    'volumes':  volumes,
}).set_index(['date', 'ticker']).join(industries)

df['signal'] = df.groupby('ticker')['price'].transform(lambda g: g > g.shift())

print(
    df.sample(3),
)

What if we want to persist the data? · immediate-term → supporting (distributed) computation · short-term → supporting ongoing analysis · medium-term → supporting collaboration · long-term → supporting reproducibility

What’s so bad about CSV?

from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path
from itertools import islice

rng = default_rng(0)

tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
    100 * rng.random(size=len(tickers))
  * rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)

industries = Series(
    data=rng.choice(
        ['energy', 'healthcare', 'tech', 'finance'],
        size=len(tickers),
    ),
    index=Index(tickers, name='ticker'),
    name='industry',
).astype('category')

df = DataFrame({
    'date':     tile(dates, len(tickers)),
    'ticker':   repeat(tickers, len(dates)),
    'price':    prices,
    'volumes':  volumes,
}).set_index(['date', 'ticker']).join(industries)

df['signal'] = df.groupby('ticker')['price'].transform(lambda g: g > g.shift())

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.csv'
    df.to_csv(filename)

    with open(filename) as f:
        for line in islice(f, 5):
            print(f'{line = }')
from pandas import DataFrame, Series, Index, date_range, read_csv
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path
from itertools import islice

rng = default_rng(0)

tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
    100 * rng.random(size=len(tickers))
  * rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)

industries = Series(
    data=rng.choice(
        ['energy', 'healthcare', 'tech', 'finance'],
        size=len(tickers),
    ),
    index=Index(tickers, name='ticker'),
    name='industry',
).astype('category')

df = DataFrame({
    'date':     tile(dates, len(tickers)),
    'ticker':   repeat(tickers, len(dates)),
    'price':    prices,
    'volumes':  volumes,
}).set_index(['date', 'ticker']).join(industries)

df['signal'] = df.groupby('ticker')['price'].transform(lambda g: g > g.shift())

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.csv'
    df.to_csv(filename)

    df = (
        read_csv(
            filename,
            parse_dates=['date'],
            index_col=['date', 'ticker'],
        )
        .sort_index()
        .assign(
            industry=lambda df: df['industry'].astype('category'),
            price=lambda df: df['price'].astype('float32'),
            volumes=lambda df: df['volumes'].astype('int32')
        )
    )
    print(
        df,
        sep='\n'
    )
from pandas import Series, period_range, read_csv
from dataclasses import dataclass
from tempfile import TemporaryDirectory
from pathlib import Path

@dataclass
class T:
    a : int
    b : int

s = Series(
    data=[T(1, 2), T(3, 4)],
    index=period_range('2020-01-01', periods=2, freq='Q', name='quarter'),
    name='data',
)

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.csv'

    s.to_csv(filename)

    before = s
    after = read_csv(filename)

    print(
        before,
        after,
        after.dtypes,
        sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
    )
from pandas import DataFrame, read_csv
from numpy.random import default_rng
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

df = DataFrame({
    ('US', 'GDP'): rng.normal(size=3),
    ('US', 'PPP'): rng.normal(size=3),
    ('EU', 'GDP'): rng.normal(size=3),
    ('EU', 'PPP'): rng.normal(size=3),
}).round(2)

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.csv'

    df.to_csv(filename)

    before = df
    after = read_csv(filename, header=[0, 1])

    print(
        before,
        after,
        # after.dtypes,
        sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
    )
from pandas import DataFrame, date_range, MultiIndex, read_csv
from numpy.random import default_rng
from string import ascii_lowercase
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

dates = date_range('2020-01-01', periods=5)
tickers = rng.choice([*ascii_lowercase], size=(3, 4)).view('<U4').ravel()

df = DataFrame(
    index=(idx := MultiIndex.from_product([dates, tickers], names=['date', 'ticker'])),
    data={
        'signal1': rng.random(size=len(idx)),
        'signal2': rng.random(size=len(idx)),
    },
)

# res = df.groupby('ticker').agg('mean')
res = df.groupby('ticker').agg(['mean', 'sem'])

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.csv'

    res.to_csv(filename)

    before = res
    after = read_csv(filename, index_col=['ticker'])

    print(
        before,
        after,
        sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
    )

CSV Pros: · intercommunication with other tools (e.g., humans, Excel, R, Matlab, &c.)

CSV Cons: · can’t readily encode/decode anything particularly complex · inefficient: slow to decode, large file sizes
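
One way to make this cost concrete is to check round-trip fidelity explicitly: write, read back, and assert that nothing was silently degraded. A minimal sketch using pandas.testing.assert_frame_equal (the small example frame here is illustrative, not from the seminar):

from pandas import DataFrame, date_range, read_csv
from pandas.testing import assert_frame_equal
from tempfile import TemporaryDirectory
from pathlib import Path

df = DataFrame({
    'date':     date_range('2020-01-01', periods=3),
    'industry': ['energy', 'tech', 'tech'],
}).astype({'industry': 'category'}).set_index('date')

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.csv'
    df.to_csv(filename)

    after = read_csv(filename, parse_dates=['date'], index_col=['date'])

    try:
        assert_frame_equal(df, after)
    except AssertionError as e:
        # fails: the 'category' dtype comes back as plain 'object'
        print(f'round trip is not faithful: {e}')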

from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path
from itertools import islice

rng = default_rng(0)

tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
    100 * rng.random(size=len(tickers))
  * rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)

industries = Series(
    data=rng.choice(
        ['energy', 'healthcare', 'tech', 'finance'],
        size=len(tickers),
    ),
    index=Index(tickers, name='ticker'),
    name='industry',
).astype('category')

df = DataFrame({
    'date':     tile(dates, len(tickers)),
    'ticker':   repeat(tickers, len(dates)),
    'price':    prices,
    'volumes':  volumes,
}).set_index(['date', 'ticker']).join(industries)

df['signal'] = df.groupby('ticker')['price'].transform(lambda g: g > g.shift())

with TemporaryDirectory() as d:
    # filename = Path(d) / 'data.json'
    # df.to_json(filename, indent=2)
    # with open(filename) as f:
    #     for line in islice(f, 5):
    #         print(f'{line = }')

    filename = Path(d) / 'data.xml'
    df.to_xml(filename)
    with open(filename) as f:
        for line in islice(f, 5):
            print(f'{line = }')

    filename = Path(d) / 'data.html'
    df.to_html(filename)
    with open(filename) as f:
        for line in islice(f, 5):
            print(f'{line = }')

    filename = Path(d) / 'data.tex'
    df.to_latex(filename)
    with open(filename) as f:
        for line in islice(f, 5):
            print(f'{line = }')

Question: What’s wrong with Pickle?

print("Let's take a look!")
from pandas import DataFrame, Series, Index, period_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile

rng = default_rng(0)

tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = period_range('2020-01-01', periods=15, freq='Q')
prices = (
    100 * rng.random(size=len(tickers))
  * rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)

industries = Series(
    data=rng.choice(
        ['energy', 'healthcare', 'tech', 'finance'],
        size=len(tickers),
    ),
    index=Index(tickers, name='ticker'),
    name='industry',
).astype('category')

df = (
    DataFrame({
        'date':     tile(dates, len(tickers)),
        'ticker':   repeat(tickers, len(dates)),
        'price':    prices,
        'volumes':  volumes,
    })
    .set_index(['date', 'ticker'])
    .groupby('ticker').apply(
        lambda g: g.rolling(3).agg(['mean', 'std'])
    )
    .pipe(
        lambda df: df
            .join(
                df[('price', 'mean')]
                .groupby('ticker').transform(lambda g: g > g.shift()).rename(('signal', None))
            )
    )
)

print(
    df.sample(3),
)
from pandas import DataFrame, Series, Index, period_range, read_pickle
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = period_range('2020-01-01', periods=15, freq='Q')
prices = (
    100 * rng.random(size=len(tickers))
  * rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)

industries = Series(
    data=rng.choice(
        ['energy', 'healthcare', 'tech', 'finance'],
        size=len(tickers),
    ),
    index=Index(tickers, name='ticker'),
    name='industry',
).astype('category')

df = (
    DataFrame({
        'date':     tile(dates, len(tickers)),
        'ticker':   repeat(tickers, len(dates)),
        'price':    prices,
        'volumes':  volumes,
    })
    .set_index(['date', 'ticker'])
    .groupby('ticker').apply(
        lambda g: g.rolling(3).agg(['mean', 'std'])
    )
    .pipe(
        lambda df: df
            .join(
                df[('price', 'mean')]
                .groupby('ticker').transform(lambda g: g > g.shift()).rename(('signal', None))
            )
    )
)

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.pkl'

    df.to_pickle(filename)

    before = df
    after = read_pickle(filename)

    print(
        before,
        after,
        sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
    )
from dataclasses import dataclass
from pickle import dump
from tempfile import TemporaryDirectory
from pathlib import Path
from textwrap import dedent
from subprocess import run

@dataclass
class T:
    a : int
    b : int

obj = T(123, 456)

with TemporaryDirectory() as d:
    filename = Path(d) / 'obj.pkl'

    with open(filename, 'wb') as f:
        dump(obj, f)

    run(['python2.7', '-c', dedent('''

        from sys import argv
        from pickle import load

        with open(argv[1], 'rb') as f:
            print(load(f))

    '''), filename])
from pickle import loads

data = b'c__builtin__\neval\n(V__import__("subprocess").run(["file", "/etc/passwd"])\ntR.'
loads(data)

Pickle Pros: · can serialise practically anything (stateless)

Pickle Cons: · insecure, unstable across Python/library versions, Python-specific · relatively inefficient
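
The security issue is partly mitigable: the standard library’s pickle documentation suggests subclassing Unpickler and overriding find_class to restrict which globals a payload may reference. A hedged sketch (the allow-list below is purely illustrative):

from io import BytesIO
from pickle import Unpickler, UnpicklingError

class RestrictedUnpickler(Unpickler):
    SAFE = {('builtins', 'range'), ('builtins', 'set'), ('builtins', 'frozenset')}

    def find_class(self, module, name):
        # refuse anything not explicitly allow-listed (e.g., builtins.eval, subprocess.run)
        if (module, name) in self.SAFE:
            return super().find_class(module, name)
        raise UnpicklingError(f'global {module}.{name} is forbidden')

data = b'c__builtin__\neval\n(V__import__("subprocess").run(["file", "/etc/passwd"])\ntR.'
try:
    RestrictedUnpickler(BytesIO(data)).load()
except UnpicklingError as e:
    print(e)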

Question: What about Feather?

print("Let's take a look!")
from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from pyarrow.feather import write_feather
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
    100 * rng.random(size=len(tickers))
  * rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)

industries = Series(
    data=rng.choice(
        ['energy', 'healthcare', 'tech', 'finance'],
        size=len(tickers),
    ),
    index=Index(tickers, name='ticker'),
    name='industry',
).astype('category')

df = (
    DataFrame({
        'date':     tile(dates, len(tickers)),
        'ticker':   repeat(tickers, len(dates)),
        'price':    prices,
        'volumes':  volumes,
    })
    .set_index(['date', 'ticker'])
    .pipe(
        lambda df: df
            .join(
                df['price']
                .groupby('ticker').transform(lambda g: g > g.shift()).rename('signal')
            )
    )
)

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.feather'

    write_feather(df, filename)

    with open(filename, 'rb') as f:
        print(f.read(100))

“Feather is a portable file format for storing Arrow tables or data frames (from languages like Python or R) that utilizes the Arrow IPC format internally. Feather was created early in the Arrow project as a proof of concept for fast, language-agnostic data frame storage for Python (pandas)”
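
Because Feather V2 is the Arrow IPC file format on disk, a generic Arrow IPC reader can open the same file. A small sketch (written uncompressed here to keep the example plain):

from pandas import DataFrame
from pyarrow.feather import write_feather
from pyarrow import ipc
from tempfile import TemporaryDirectory
from pathlib import Path

df = DataFrame({'x': [1.0, 2.0], 'y': [3.0, 4.0]})

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.feather'
    write_feather(df, filename, compression='uncompressed')

    # the very same file opens with the generic Arrow IPC file reader
    with open(filename, 'rb') as f:
        print(ipc.open_file(f).read_all())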

from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
    100 * rng.random(size=len(tickers))
  * rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)

industries = Series(
    data=rng.choice(
        ['energy', 'healthcare', 'tech', 'finance'],
        size=len(tickers),
    ),
    index=Index(tickers, name='ticker'),
    name='industry',
).astype('category')

df = (
    DataFrame({
        'date':     tile(dates, len(tickers)),
        'ticker':   repeat(tickers, len(dates)),
        'price':    prices,
        'volumes':  volumes,
    })
    .set_index(['date', 'ticker'])
    .pipe(
        lambda df: df
            .join(
                df['price']
                .groupby('ticker').transform(lambda g: g > g.shift()).rename('signal')
            )
    )
)

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.feather'

    write_feather(df, filename, compression='zstd')
    before = df
    after = read_feather(filename)

    print(
        before,
        after,
        sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
    )
from pandas import Series, date_range, read_csv
from numpy.random import default_rng
from numpy import allclose
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path
from humanize import naturalsize
from _util import timed

rng = default_rng(0)

s = Series(
    index=(idx := date_range('2000-01-01', freq='1T', periods=500_000)),
    data=rng.random(size=len(idx)),
    name='signal',
).rename_axis('date')

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.csv.gz'
    with timed('.to_csv'):
        s.to_csv(filename, compression='gzip')
    before = s
    with timed('read_csv'):
        after = read_csv(
            filename,
            index_col=['date'],
            parse_dates=['date'],
        ).squeeze()
    assert (before.index == after.index).all() and allclose(before, after)
    print(f'{naturalsize(filename.stat().st_size) = }')

    # filename = Path(d) / 'data.feather'
    # with timed('write_feather'):
    #     write_feather(s, filename)

    filename = Path(d) / 'data.feather'
    with timed('write_feather'):
        write_feather(s.to_frame(), filename, compression='zstd')
    before = s.to_frame()
    with timed('read_feather'):
        after = read_feather(filename)
    assert (before.index == after.index).all() and (before.values == after.values).all()
    print(f'{naturalsize(filename.stat().st_size) = }')
from pandas import period_range, DataFrame, MultiIndex, array
from numpy.random import default_rng
from string import ascii_lowercase
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

df = DataFrame(
    index=(idx := MultiIndex.from_product([
        ['a', 'b', 'c'],
        period_range('2020-01-01', freq='Q', periods=3),
    ], names=['entity', 'quarter'])),
    data={
        'a': rng.normal(size=len(idx)),
        'b': rng.normal(size=len(idx)),
        'c': array(
            rng.choice([*ascii_lowercase], size=(len(idx), 4)).view('<U4').ravel()
        ).astype('category'),
    },
)

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.feather'

    before = df
    write_feather(df, filename)
    after = read_feather(filename)

    print(
        before.head(4),
        after.head(4),
        after.dtypes,
        sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
    )
from dataclasses import dataclass
from pandas import DataFrame
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path

@dataclass
class T:
     a : int = 0
     b : int = 0

df = DataFrame({
    'x': [T(), T()],
    'y': [T(), T()],
})

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.feather'

    before = df
    # note: this raises, since Arrow cannot convert arbitrary Python objects (here, dataclass instances)
    write_feather(df, filename)
    after = read_feather(filename)

    print(
        before.head(4),
        after.head(4),
        sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
    )

Feather Pros: · you may already be using Arrow (!) · relatively small file-size · interoperability (e.g., R, Rust) · fast

Feather Cons: · less general; a harder cliff when data doesn’t fit Arrow’s columnar model · not generally designed for archival use
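
One workaround for the metadata question (pandas’ attrs do not survive any of these round trips, as we see later) is to attach custom key/value metadata to the Arrow schema itself, which Feather persists. A hedged sketch; the metadata key is illustrative:

from pandas import DataFrame
from pyarrow import Table
from pyarrow.feather import write_feather, read_table
from tempfile import TemporaryDirectory
from pathlib import Path

df = DataFrame({'x': [1.0, 2.0], 'y': [3.0, 4.0]})

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.feather'

    table = Table.from_pandas(df)
    # merge our key into the existing schema metadata (which includes the b'pandas' block)
    table = table.replace_schema_metadata({
        **(table.schema.metadata or {}),
        b'source': b'seminar iii example',
    })
    write_feather(table, filename, compression='zstd')

    after = read_table(filename)
    print(after.schema.metadata[b'source'])
    print(after.to_pandas())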

from pandas import Series, DataFrame, date_range, MultiIndex
from numpy.random import default_rng
from pyarrow.feather import write_feather, read_feather
from io import BytesIO
from tempfile import TemporaryDirectory
from pathlib import Path
from tarfile import open as tf_open, TarInfo

rng = default_rng(0)

df = DataFrame(
    index=(idx := MultiIndex.from_product([
        ['a', 'b', 'c'],
        date_range('2020-01-01', periods=3),
    ], names=['entity', 'quarter'])),
    data={
        'signal': rng.normal(size=len(idx)),
        'value': rng.normal(size=len(idx)),
    },
)

s = Series(['x', 'y', 'z'], index=['a', 'b', 'c'])

print(
    df,
    s,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.tar'

    with tf_open(filename, 'w') as f:
        # set TarInfo.size and rewind each buffer so addfile actually writes the bytes
        for name, obj in [('df', df), ('s', s.to_frame())]:
            write_feather(obj, out := BytesIO(), compression='zstd')
            info = TarInfo(name=name)
            info.size = out.getbuffer().nbytes
            out.seek(0)
            f.addfile(info, out)
from numpy.random import default_rng
from pyarrow.feather import write_feather
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

data = rng.normal(size=(3, 3, 3))

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.feather'

    # note: this raises, since Feather stores tabular data (DataFrame/Table), not bare n-dimensional arrays
    write_feather(data, filename)

Question: What about Parquet?

“Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. It provides efficient data compression and encoding schemes with enhanced performance to handle complex data in bulk.”
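
That column orientation is what makes selective reads cheap: a reader can materialise only the columns it asks for. A minimal sketch with pandas’ read_parquet (the column names are illustrative):

from pandas import DataFrame, read_parquet
from numpy.random import default_rng
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

df = DataFrame({
    'price':  rng.random(size=1_000),
    'volume': rng.integers(0, 50_000, size=1_000),
    'signal': rng.random(size=1_000),
})

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.parquet'
    df.to_parquet(filename)

    # only the requested column is read and materialised
    print(read_parquet(filename, columns=['price']).head(3))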

from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
    100 * rng.random(size=len(tickers))
  * rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)

industries = Series(
    data=rng.choice(
        ['energy', 'healthcare', 'tech', 'finance'],
        size=len(tickers),
    ),
    index=Index(tickers, name='ticker'),
    name='industry',
).astype('category')

df = (
    DataFrame({
        'date':     tile(dates, len(tickers)),
        'ticker':   repeat(tickers, len(dates)),
        'price':    prices,
        'volumes':  volumes,
    })
    .set_index(['date', 'ticker'])
    .pipe(
        lambda df: df
            .join(
                df['price']
                .groupby('ticker').transform(lambda g: g > g.shift()).rename('signal')
            )
    )
)

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.parquet'

    df.to_parquet(filename)

    with open(filename, 'rb') as f:
        print(f.read(100))
from pandas import Series, date_range, read_parquet
from numpy.random import default_rng
from numpy import allclose
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path
from humanize import naturalsize
from _util import timed

rng = default_rng(0)

s = Series(
    index=(idx := date_range('2000-01-01', freq='1T', periods=500_000)),
    data=rng.random(size=len(idx)),
    name='signal',
).rename_axis('date')

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.feather'
    with timed('write_feather'):
        write_feather(s.to_frame(), filename, compression='zstd')
    before = s.to_frame()
    with timed('read_feather'):
        after = read_feather(filename)
    assert (before.index == after.index).all() and (before.values == after.values).all()
    print(f'{naturalsize(filename.stat().st_size) = }')

    filename = Path(d) / 'data.parquet'
    with timed('.to_parquet'):
        # s.to_frame().to_parquet(filename)
        s.to_frame().to_parquet(filename, compression='brotli')
    before = s.to_frame()
    with timed('read_parquet'):
        after = read_parquet(filename)
    assert (before.index == after.index).all() and (before.values == after.values).all()
    print(f'{naturalsize(filename.stat().st_size) = }')
from dataclasses import dataclass
from pandas import DataFrame
from tempfile import TemporaryDirectory
from pathlib import Path

@dataclass
class T:
     a : int = 0
     b : int = 0

df = DataFrame({
    'x': [T(), T()],
    'y': [T(), T()],
})

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.parquet'
    # note: this raises, since Arrow cannot convert arbitrary Python objects (here, dataclass instances)
    df.to_parquet(filename)

Question: What about HDF?

print("Let's take a look!")
from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
    100 * rng.random(size=len(tickers))
  * rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)

industries = Series(
    data=rng.choice(
        ['energy', 'healthcare', 'tech', 'finance'],
        size=len(tickers),
    ),
    index=Index(tickers, name='ticker'),
    name='industry',
).astype('category')

df = (
    DataFrame({
        'date':     tile(dates, len(tickers)),
        'ticker':   repeat(tickers, len(dates)),
        'price':    prices,
        'volumes':  volumes,
    })
    .set_index(['date', 'ticker'])
    .pipe(
        lambda df: df
            .join(
                df['price']
                .groupby('ticker').transform(lambda g: g > g.shift()).rename('signal')
            )
    )
)

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.hdf'

    df.to_hdf(filename, key='df')

    with open(filename, 'rb') as f:
        print(f.read(100))
from pandas import Series, date_range, read_parquet, read_hdf
from numpy.random import default_rng
from numpy import allclose
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path
from humanize import naturalsize
from _util import timed

rng = default_rng(0)

s = Series(
    index=(idx := date_range('2000-01-01', freq='1T', periods=500_000)),
    data=rng.random(size=len(idx)),
    name='signal',
).rename_axis('date')

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.hdf'
    with timed('.to_hdf'):
        # s.to_hdf(filename, key='s')
        s.to_hdf(filename, key='s', complib='blosc', complevel=9)
    before = s
    with timed('read_hdf'):
        after = read_hdf(filename, key='s')
    assert (before.index == after.index).all() and (before.values == after.values).all()
    print(f'{naturalsize(filename.stat().st_size) = }')

    filename = Path(d) / 'data.feather'
    with timed('write_feather'):
        write_feather(s.to_frame(), filename, compression='zstd')
    before = s.to_frame()
    with timed('read_feather'):
        after = read_feather(filename)
    assert (before.index == after.index).all() and (before.values == after.values).all()
    print(f'{naturalsize(filename.stat().st_size) = }')

    filename = Path(d) / 'data.parquet'
    with timed('.to_parquet'):
        s.to_frame().to_parquet(filename, compression='brotli')
    before = s.to_frame()
    with timed('read_parquet'):
        after = read_parquet(filename)
    assert (before.index == after.index).all() and (before.values == after.values).all()
    print(f'{naturalsize(filename.stat().st_size) = }')
from dataclasses import dataclass
from pandas import DataFrame
from tempfile import TemporaryDirectory
from pathlib import Path

@dataclass
class T:
     a : int = 0
     b : int = 0

df = DataFrame({
    'x': [T(), T()],
    'y': [T(), T()],
})

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.hdf'
    # works, but pandas falls back to pickling the object columns and emits a PerformanceWarning
    df.to_hdf(filename, key='df', complib='blosc', complevel=9)
from dataclasses import dataclass
from pandas import DataFrame, read_hdf
from tempfile import TemporaryDirectory
from pathlib import Path
from warnings import catch_warnings, filterwarnings

@dataclass
class T:
     a : int = 0
     b : int = 0

df = DataFrame({
    'x': [T(), T()],
    'y': [T(), T()],
})

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.hdf'
    with catch_warnings():
        filterwarnings('ignore')
        df.to_hdf(filename, key='df', complib='blosc', complevel=9)

    before = df
    after = read_hdf(filename, key='df')

    print(
        before,
        after,
        sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
    )
from pandas import Series, DataFrame, date_range, MultiIndex, read_hdf
from numpy.random import default_rng
from tempfile import TemporaryDirectory
from pathlib import Path
from tables import open_file

rng = default_rng(0)

df = DataFrame(
    index=(idx := MultiIndex.from_product([
        ['a', 'b', 'c'],
        date_range('2020-01-01', periods=3),
    ], names=['entity', 'date'])),
    data={
        'signal': rng.normal(size=len(idx)),
        'value': rng.normal(size=len(idx)),
    },
)

s = Series(['x', 'y', 'z'], index=['a', 'b', 'c'])

print(
    df,
    s,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.hdf'
    df.to_hdf(filename, key='df', complib='blosc', complevel=9)
    s.to_hdf(filename, key='s', complib='blosc', complevel=9)
from numpy.random import default_rng
from tables import open_file
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

xs = rng.normal(size=(3, 3, 3))

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.hdf'
    with open_file(filename, 'a') as f:
        f.create_array(f.root, 'xs', xs)

HDF Pros: · comparable file sizes · filesystem-in-a-file (many datasets per file under hierarchical keys) · relatively general (can store n-dimensional data; defaults to pickle for arbitrary data)

HDF Cons: · relatively slow
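
The “filesystem-in-a-file” point is easiest to see with pandas’ HDFStore, which lets one file hold many datasets under hierarchical keys. A sketch (the key names are illustrative):

from pandas import DataFrame, HDFStore, date_range
from numpy.random import default_rng
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

idx = date_range('2020-01-01', periods=5)
prices  = DataFrame({'price':  rng.random(size=len(idx))},           index=idx)
volumes = DataFrame({'volume': rng.integers(0, 100, size=len(idx))}, index=idx)

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.hdf'

    with HDFStore(filename, complib='blosc', complevel=9) as store:
        store.put('/prices/daily', prices)
        store.put('/volumes/daily', volumes)
        # the keys form a small hierarchy inside the single file
        print(store.keys())
        print(store['/prices/daily'].head(2))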

from pandas import DataFrame, MultiIndex, date_range, read_parquet, read_hdf
from numpy.random import default_rng
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

df = DataFrame(
    index=(idx := MultiIndex.from_product([
        ['a', 'b', 'c'],
        date_range('2020-01-01', periods=3),
    ], names=['entity', 'date'])),
    data={
        'x': rng.random(size=len(idx)),
        'y': rng.random(size=len(idx)),
    }
)
df.attrs = {'metadata': 'some metadata'}
df = df.groupby('date').sum()

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.fea'
    write_feather(df, filename)
    after = read_feather(filename)
    print(f'Feather: {after.attrs = }')

    filename = Path(d) / 'data.parquet'
    df.to_parquet(filename)
    after = read_parquet(filename)
    print(f'Parquet: {after.attrs = }')

    filename = Path(d) / 'data.hdf'
    df.to_hdf(filename, key='df')
    after = read_hdf(filename, key='df')
    print(f'HDF: {after.attrs = }')
from numpy.random import default_rng
from h5py import File
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.hdf'
    with File(filename, 'a') as f:
        f['data'] = rng.normal(size=(3, 3, 3))  # h5py stores n-dimensional arrays directly under named keys

Question: What about other formats (NetCDF, Zarr, SQLite)?

print("Let's take a look!")
from numpy.random import default_rng

rng = default_rng(0)

xs = rng.normal(size=(3, 3, 3))
ys = rng.normal(size=(2, 1, 1, 3))

print(
    xs,
    ys,
    # xs * ys,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy.random import default_rng
from xarray import DataArray

rng = default_rng(0)

xs = DataArray(
    data=rng.normal(size=(3, 3, 3)),
    dims=[*'abc'],
)
ys = DataArray(
    data=rng.normal(size=(2, 1, 1, 3)),
    dims=[*'defc'],
)

print(
    xs,
    ys,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy.random import default_rng
from xarray import DataArray, Dataset

rng = default_rng(0)

xs = DataArray(
    data=rng.normal(size=(3, 3, 3)),
    dims=[*'abc'],
)
ys = DataArray(
    data=rng.normal(size=(2, 1, 1, 3)),
    dims=[*'defc'],
)

ds = Dataset({'xs': xs, 'ys': ys})

print(
    ds,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy.random import default_rng
from xarray import DataArray, Dataset
from tempfile import TemporaryDirectory
from pathlib import Path

rng = default_rng(0)

xs = DataArray(
    data=rng.normal(size=(3, 3, 3)),
    dims=[*'abc'],
)
ys = DataArray(
    data=rng.normal(size=(2, 1, 1, 3)),
    dims=[*'defc'],
)

ds = Dataset({'xs': xs, 'ys': ys})

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.nc'
    ds.to_netcdf(filename)
from zarr import open as zarr_open, group
from numcodecs import Blosc
from tempfile import TemporaryDirectory
from pathlib import Path
from numpy.random import default_rng

rng = default_rng(0)

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.zarr'
    z = zarr_open(
        filename,
        mode='w',
        shape=(sh := (10_000, 10_000)),
        chunks=(1_000, 1_000),
        dtype=(dt := 'uint8'),
        compressor=Blosc(
            clevel=9,
        ),
    )
    z[:] = rng.integers(0, 255, size=sh, dtype=dt)
    print(z.info)
from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path
from sqlite3 import connect

rng = default_rng(0)

tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
    100 * rng.random(size=len(tickers))
  * rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)

industries = Series(
    data=rng.choice(
        ['energy', 'healthcare', 'tech', 'finance'],
        size=len(tickers),
    ),
    index=Index(tickers, name='ticker'),
    name='industry',
).astype('category')

df = DataFrame({
    'date':     tile(dates, len(tickers)),
    'ticker':   repeat(tickers, len(dates)),
    'price':    prices,
    'volumes':  volumes,
}).set_index(['date', 'ticker']).join(industries)

df['signal'] = df.groupby('ticker')['price'].transform(lambda g: g > g.shift())

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.sql'

    with connect(filename) as db:
        df.to_sql('df', db)

    with connect(filename) as db:
        print([*db.execute('select * from df')])
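
Reading a table back into pandas goes through read_sql; as with CSV, the index and date parsing must be re-declared on the way in. A small self-contained sketch (the table and column names are illustrative):

from pandas import DataFrame, date_range, read_sql
from tempfile import TemporaryDirectory
from pathlib import Path
from sqlite3 import connect

df = DataFrame({
    'date':  date_range('2020-01-01', periods=3),
    'price': [1.0, 2.0, 3.0],
}).set_index('date')

with TemporaryDirectory() as d:
    filename = Path(d) / 'data.sql'

    with connect(filename) as db:
        df.to_sql('df', db)

    with connect(filename) as db:
        # dates come back as text unless parsed; the index must be re-declared
        after = read_sql(
            'select * from df',
            db,
            parse_dates=['date'],
            index_col=['date'],
        )

    print(after, after.dtypes, sep='\n')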