“I used to be with ‘it,’ but then changed what ‘it’ was. Now what I’m with isn’t ‘it’ anymore, and what’s ‘it’ seems weird and scary.”
— ‘Homerpalooza’ (S07E24)
How do you persist tabular data; how do you store it to disk? Of all the common formats—.csv, .npy, Parquet, Feather, HDF, etc.—how do you know which to choose and what trade-offs they each present?
Do you…
Then come join us for a session on columnar file and database formats!
How do you store tabular data between analyses? How do you store important
metadata along with the tabular data? What do you do about parts of your data
that may require runtime support (e.g., complex indices or user-defined
types)? Why and when is pickle not enough—or even a bad idea?
In this episode, we’ll discuss persisting tabular data for storage to disk or
transmission to other users (e.g., to and from API endpoints, or between
nodes in a distributed computation). We’ll discuss the differences between
text-based formats (e.g., csv, txt, fwf, tsv, dat, json), binary
formats (e.g., pickle, npy, parquet, feather), and database formats
(e.g., hdf, sql). We’ll cover which of these is most convenient and
performant for persisting data, as well as the contexts in which each file
format should be used or avoided.
Keywords: parquet, csv, persistence, feather, tabular, data
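Before diving in, a rough map of the territory: pandas exposes a writer/reader pair for each of these families. A minimal sketch (the file names are placeholders; the Parquet and HDF writers assume pyarrow and PyTables are installed):
from pandas import DataFrame, read_csv, read_pickle, read_parquet, read_hdf
frame = DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
frame.to_csv('data.csv', index=False)     # text-based: human-readable, dtypes lost
frame.to_pickle('data.pkl')               # binary: Python-specific serialisation
frame.to_parquet('data.parquet')          # binary: columnar, via Arrow
frame.to_hdf('data.h5', key='frame')      # database-like: many keys per file
roundtrip = read_csv('data.csv')          # each writer has a matching reader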
print("Let's take a look!")
from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
rng = default_rng(0)
tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
100 * rng.random(size=len(tickers))
* rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)
industries = Series(
data=rng.choice(
['energy', 'healthcare', 'tech', 'finance'],
size=len(tickers),
),
index=Index(tickers, name='ticker'),
name='industry',
).astype('category')
df = DataFrame({
'date': tile(dates, len(tickers)),
'ticker': repeat(tickers, len(dates)),
'price': prices,
'volumes': volumes,
}).set_index(['date', 'ticker']).join(industries)
df['signal'] = df.groupby('ticker')['price'].transform(lambda g: g > g.shift())
print(
df.sample(3),
)
What if we want to persist the data? · immediate-term → supporting (distributed) computation · short-term → supporting ongoing analysis · medium-term → supporting collaboration · long-term → supporting reproducibility
What’s so bad about CSV?
from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path
from itertools import islice
rng = default_rng(0)
tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
100 * rng.random(size=len(tickers))
* rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)
industries = Series(
data=rng.choice(
['energy', 'healthcare', 'tech', 'finance'],
size=len(tickers),
),
index=Index(tickers, name='ticker'),
name='industry',
).astype('category')
df = DataFrame({
'date': tile(dates, len(tickers)),
'ticker': repeat(tickers, len(dates)),
'price': prices,
'volumes': volumes,
}).set_index(['date', 'ticker']).join(industries)
df['signal'] = df.groupby('ticker')['price'].transform(lambda g: g > g.shift())
with TemporaryDirectory() as d:
filename = Path(d) / 'data.csv'
df.to_csv(filename)
with open(filename) as f:
for line in islice(f, 5):
print(f'{line = }')
from pandas import DataFrame, Series, Index, date_range, read_csv
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path
from itertools import islice
rng = default_rng(0)
tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
100 * rng.random(size=len(tickers))
* rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)
industries = Series(
data=rng.choice(
['energy', 'healthcare', 'tech', 'finance'],
size=len(tickers),
),
index=Index(tickers, name='ticker'),
name='industry',
).astype('category')
df = DataFrame({
'date': tile(dates, len(tickers)),
'ticker': repeat(tickers, len(dates)),
'price': prices,
'volumes': volumes,
}).set_index(['date', 'ticker']).join(industries)
df['signal'] = df.groupby('ticker')['price'].transform(lambda g: g > g.shift())
with TemporaryDirectory() as d:
filename = Path(d) / 'data.csv'
df.to_csv(filename)
df = (
read_csv(
filename,
parse_dates=['date'],
index_col=['date', 'ticker'],
)
.sort_index()
.assign(
industry=lambda df: df['industry'].astype('category'),
price=lambda df: df['price'].astype('float32'),
volumes=lambda df: df['volumes'].astype('int32')
)
)
print(
df,
sep='\n'
)
from pandas import Series, period_range, read_csv
from dataclasses import dataclass
from tempfile import TemporaryDirectory
from pathlib import Path
@dataclass
class T:
a : int
b : int
s = Series(
data=[T(1, 2), T(3, 4)],
index=period_range('2020-01-01', periods=2, freq='Q', name='quarter'),
name='data',
)
with TemporaryDirectory() as d:
filename = Path(d) / 'data.csv'
s.to_csv(filename)
before = s
after = read_csv(filename)
print(
before,
after,
after.dtypes,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import DataFrame, read_csv
from numpy.random import default_rng
from tempfile import TemporaryDirectory
from pathlib import Path
rng = default_rng(0)
df = DataFrame({
('US', 'GDP'): rng.normal(size=3),
('US', 'PPP'): rng.normal(size=3),
('EU', 'GDP'): rng.normal(size=3),
('EU', 'PPP'): rng.normal(size=3),
}).round(2)
with TemporaryDirectory() as d:
filename = Path(d) / 'data.csv'
df.to_csv(filename)
before = df
after = read_csv(filename, header=[0, 1])
print(
before,
after,
# after.dtypes,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import DataFrame, date_range, MultiIndex, read_csv
from numpy.random import default_rng
from string import ascii_lowercase
from tempfile import TemporaryDirectory
from pathlib import Path
rng = default_rng(0)
dates = date_range('2020-01-01', periods=5)
tickers = rng.choice([*ascii_lowercase], size=(3, 4)).view('<U4').ravel()
df = DataFrame(
index=(idx := MultiIndex.from_product([dates, tickers], names=['date', 'ticker'])),
data={
'signal1': rng.random(size=len(idx)),
'signal2': rng.random(size=len(idx)),
},
)
# res = df.groupby('ticker').agg('mean')
res = df.groupby('ticker').agg(['mean', 'sem'])
with TemporaryDirectory() as d:
filename = Path(d) / 'data.csv'
res.to_csv(filename)
before = res
after = read_csv(filename, index_col=['ticker'])
print(
before,
after,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
CSV Pros: · interoperability with other tools and with humans (e.g., Excel, R, Matlab, &c.)
CSV Cons: · can’t readily encode/decode anything particularly complex (dtypes, MultiIndexes, and user-defined types are flattened to text) · inefficient: slow to parse, large file sizes
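One way to soften the dtype loss (a sketch, not from the session code; the sidecar file name is illustrative): write the column dtypes to a small JSON file next to the CSV and re-apply them on read. Dates would still need parse_dates=.
from pandas import DataFrame, read_csv
from json import dump, load
frame = DataFrame({'industry': ['tech', 'energy'], 'price': [1.5, 2.5]})
frame['industry'] = frame['industry'].astype('category')
frame.to_csv('data.csv', index=False)
with open('dtypes.json', 'w') as f:            # sidecar holding the column dtypes
    dump(frame.dtypes.astype(str).to_dict(), f)
with open('dtypes.json') as f:
    dtypes = load(f)
after = read_csv('data.csv').astype(dtypes)    # categories and floats restored
print(after.dtypes)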
from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path
from itertools import islice
rng = default_rng(0)
tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
100 * rng.random(size=len(tickers))
* rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)
industries = Series(
data=rng.choice(
['energy', 'healthcare', 'tech', 'finance'],
size=len(tickers),
),
index=Index(tickers, name='ticker'),
name='industry',
).astype('category')
df = DataFrame({
'date': tile(dates, len(tickers)),
'ticker': repeat(tickers, len(dates)),
'price': prices,
'volumes': volumes,
}).set_index(['date', 'ticker']).join(industries)
df['signal'] = df.groupby('ticker')['price'].transform(lambda g: g > g.shift())
with TemporaryDirectory() as d:
# filename = Path(d) / 'data.json'
# df.to_json(filename, indent=2)
# with open(filename) as f:
# for line in islice(f, 5):
# print(f'{line = }')
filename = Path(d) / 'data.xml'
df.to_xml(filename)
with open(filename) as f:
for line in islice(f, 5):
print(f'{line = }')
filename = Path(d) / 'data.html'
df.to_html(filename)
with open(filename) as f:
for line in islice(f, 5):
print(f'{line = }')
filename = Path(d) / 'data.tex'
df.to_latex(filename)
with open(filename) as f:
for line in islice(f, 5):
print(f'{line = }')
print("Let's take a look!")
from pandas import DataFrame, Series, Index, period_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
rng = default_rng(0)
tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = period_range('2020-01-01', periods=15, freq='Q')
prices = (
100 * rng.random(size=len(tickers))
* rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)
industries = Series(
data=rng.choice(
['energy', 'healthcare', 'tech', 'finance'],
size=len(tickers),
),
index=Index(tickers, name='ticker'),
name='industry',
).astype('category')
df = (
DataFrame({
'date': tile(dates, len(tickers)),
'ticker': repeat(tickers, len(dates)),
'price': prices,
'volumes': volumes,
})
.set_index(['date', 'ticker'])
.groupby('ticker').apply(
lambda g: g.rolling(3).agg(['mean', 'std'])
)
.pipe(
lambda df: df
.join(
df[('price', 'mean')]
.groupby('ticker').transform(lambda g: g > g.shift()).rename(('signal', None))
)
)
)
print(
df.sample(3),
)
from pandas import DataFrame, Series, Index, period_range, read_pickle
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path
rng = default_rng(0)
tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = period_range('2020-01-01', periods=15, freq='Q')
prices = (
100 * rng.random(size=len(tickers))
* rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)
industries = Series(
data=rng.choice(
['energy', 'healthcare', 'tech', 'finance'],
size=len(tickers),
),
index=Index(tickers, name='ticker'),
name='industry',
).astype('category')
df = (
DataFrame({
'date': tile(dates, len(tickers)),
'ticker': repeat(tickers, len(dates)),
'price': prices,
'volumes': volumes,
})
.set_index(['date', 'ticker'])
.groupby('ticker').apply(
lambda g: g.rolling(3).agg(['mean', 'std'])
)
.pipe(
lambda df: df
.join(
df[('price', 'mean')]
.groupby('ticker').transform(lambda g: g > g.shift()).rename(('signal', None))
)
)
)
with TemporaryDirectory() as d:
filename = Path(d) / 'data.pkl'
df.to_pickle(filename)
before = df
after = read_pickle(filename)
print(
before,
after,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from dataclasses import dataclass
from pickle import dump
from tempfile import TemporaryDirectory
from pathlib import Path
from textwrap import dedent
from subprocess import run
@dataclass
class T:
a : int
b : int
obj = T(123, 456)
with TemporaryDirectory() as d:
filename = Path(d) / 'obj.pkl'
with open(filename, 'wb') as f:
dump(obj, f)
run(['python2.7', '-c', dedent('''
from sys import argv
from pickle import load
with open(argv[1], 'rb') as f:
print(load(f))
'''), filename])
from pickle import loads
data = b'c__builtin__\neval\n(V__import__("subprocess").run(["file", "/etc/passwd"])\ntR.'
loads(data)
Pickle Pros: · can serialise practically anything (so long as it holds no external state, e.g. open files or sockets)
Pickle Cons: · insecure, unstable, Python-specific · relatively inefficient
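If you genuinely must load pickles you didn’t produce, the pattern suggested in the pickle documentation is to restrict which globals the stream may resolve; a minimal sketch (the allow-list here is illustrative and deliberately tiny):
from pickle import Unpickler, UnpicklingError
from io import BytesIO
class RestrictedUnpickler(Unpickler):
    ALLOWED = {('builtins', 'range')}  # hypothetical allow-list
    def find_class(self, module, name):
        if (module, name) not in self.ALLOWED:
            raise UnpicklingError(f'forbidden global: {module}.{name}')
        return super().find_class(module, name)
# the same malicious payload as above now fails instead of running a subprocess
data = b'c__builtin__\neval\n(V__import__("subprocess").run(["file", "/etc/passwd"])\ntR.'
try:
    RestrictedUnpickler(BytesIO(data)).load()
except UnpicklingError as e:
    print(e)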
print("Let's take a look!")
from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from pyarrow.feather import write_feather
from tempfile import TemporaryDirectory
from pathlib import Path
rng = default_rng(0)
tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
100 * rng.random(size=len(tickers))
* rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)
industries = Series(
data=rng.choice(
['energy', 'healthcare', 'tech', 'finance'],
size=len(tickers),
),
index=Index(tickers, name='ticker'),
name='industry',
).astype('category')
df = (
DataFrame({
'date': tile(dates, len(tickers)),
'ticker': repeat(tickers, len(dates)),
'price': prices,
'volumes': volumes,
})
.set_index(['date', 'ticker'])
.pipe(
lambda df: df
.join(
df['price']
.groupby('ticker').transform(lambda g: g > g.shift()).rename('signal')
)
)
)
with TemporaryDirectory() as d:
filename = Path(d) / 'data.feather'
write_feather(df, filename)
with open(filename, 'rb') as f:
print(f.read(100))
“Feather is a portable file format for storing Arrow tables or data frames (from languages like Python or R) that utilizes the Arrow IPC format internally. Feather was created early in the Arrow project as a proof of concept for fast, language-agnostic data frame storage for Python (pandas)”
from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path
rng = default_rng(0)
tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
100 * rng.random(size=len(tickers))
* rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)
industries = Series(
data=rng.choice(
['energy', 'healthcare', 'tech', 'finance'],
size=len(tickers),
),
index=Index(tickers, name='ticker'),
name='industry',
).astype('category')
df = (
DataFrame({
'date': tile(dates, len(tickers)),
'ticker': repeat(tickers, len(dates)),
'price': prices,
'volumes': volumes,
})
.set_index(['date', 'ticker'])
.pipe(
lambda df: df
.join(
df['price']
.groupby('ticker').transform(lambda g: g > g.shift()).rename('signal')
)
)
)
with TemporaryDirectory() as d:
filename = Path(d) / 'data.feather'
write_feather(df, filename, compression='zstd')
before = df
after = read_feather(filename)
print(
before,
after,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import Series, date_range, read_csv
from numpy.random import default_rng
from numpy import allclose
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path
from humanize import naturalsize
from _util import timed
rng = default_rng(0)
s = Series(
index=(idx := date_range('2000-01-01', freq='1T', periods=500_000)),
data=rng.random(size=len(idx)),
name='signal',
).rename_axis('date')
with TemporaryDirectory() as d:
filename = Path(d) / 'data.csv.gz'
with timed('.to_csv'):
s.to_csv(filename, compression='gzip')
before = s
with timed('read_csv'):
after = read_csv(
filename,
index_col=['date'],
parse_dates=['date'],
).squeeze()
assert (before.index == after.index).all() and allclose(before, after)
print(f'{naturalsize(filename.stat().st_size) = }')
# filename = Path(d) / 'data.feather'
# with timed('write_feather'):
# write_feather(s, filename)
filename = Path(d) / 'data.feather'
with timed('write_feather'):
write_feather(s.to_frame(), filename, compression='zstd')
before = s.to_frame()
with timed('read_feather'):
after = read_feather(filename)
assert (before.index == after.index).all() and (before.values == after.values).all()
print(f'{naturalsize(filename.stat().st_size) = }')
from pandas import period_range, DataFrame, MultiIndex, array
from numpy.random import default_rng
from string import ascii_lowercase
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path
rng = default_rng(0)
df = DataFrame(
index=(idx := MultiIndex.from_product([
['a', 'b', 'c'],
period_range('2020-01-01', freq='Q', periods=3),
], names=['entity', 'quarter'])),
data={
'a': rng.normal(size=len(idx)),
'b': rng.normal(size=len(idx)),
'c': array(
rng.choice([*ascii_lowercase], size=(len(idx), 4)).view('<U4').ravel()
).astype('category'),
},
)
with TemporaryDirectory() as d:
filename = Path(d) / 'data.feather'
before = df
write_feather(df, filename)
after = read_feather(filename)
print(
before.head(4),
after.head(4),
after.dtypes,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from dataclasses import dataclass
from pandas import DataFrame
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path
@dataclass
class T:
a : int = 0
b : int = 0
df = DataFrame({
'x': [T(), T()],
'y': [T(), T()],
})
with TemporaryDirectory() as d:
filename = Path(d) / 'data.feather'
before = df
write_feather(df, filename)
after = read_feather(filename)
print(
before.head(4),
after.head(4),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Feather Pros: · you may already be using Arrow (!) · relatively small file-size · interoperability (e.g., R, Rust) · fast
Feather Cons: · less general; sharper failure cliff (unsupported types raise errors outright) · not generally designed for long-term archival use
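One upside of the columnar layout worth seeing in code: Arrow can materialise only the columns you ask for, which keeps reads cheap on wide frames. A small self-contained sketch (the file name and column names are illustrative):
from pandas import DataFrame, date_range
from numpy.random import default_rng
from pyarrow.feather import write_feather, read_feather
rng = default_rng(0)
wide = DataFrame(
    rng.random(size=(1_000, 4)),
    columns=['a', 'b', 'c', 'd'],
    index=date_range('2020-01-01', periods=1_000, name='date'),
)
write_feather(wide.reset_index(), 'data.feather', compression='zstd')
subset = read_feather('data.feather', columns=['date', 'a'])  # only two columns read
print(subset.head(2))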
from pandas import Series, DataFrame, date_range, MultiIndex
from numpy.random import default_rng
from pyarrow.feather import write_feather, read_feather
from io import BytesIO
from tempfile import TemporaryDirectory
from pathlib import Path
from tarfile import open as tf_open, TarInfo
rng = default_rng(0)
df = DataFrame(
index=(idx := MultiIndex.from_product([
['a', 'b', 'c'],
date_range('2020-01-01', periods=3),
], names=['entity', 'quarter'])),
data={
'signal': rng.normal(size=len(idx)),
'value': rng.normal(size=len(idx)),
},
)
s = Series(['x', 'y', 'z'], index=['a', 'b', 'c'])
print(
df,
s,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
with TemporaryDirectory() as d:
    filename = Path(d) / 'data.tar'
    with tf_open(filename, 'w') as f:
        # TarInfo defaults to size=0; record each buffer's size and rewind it,
        # otherwise addfile writes empty members into the archive.
        write_feather(df, out := BytesIO(), compression='zstd')
        info = TarInfo(name='df'); info.size = out.getbuffer().nbytes; out.seek(0)
        f.addfile(info, out)
        write_feather(s.to_frame(), out := BytesIO(), compression='zstd')
        info = TarInfo(name='s'); info.size = out.getbuffer().nbytes; out.seek(0)
        f.addfile(info, out)
from numpy.random import default_rng
from pyarrow.feather import write_feather
from tempfile import TemporaryDirectory
from pathlib import Path
rng = default_rng(0)
data = rng.normal(size=(3, 3, 3))
with TemporaryDirectory() as d:
filename = Path(d) / 'data.feather'
write_feather(data, filename)
What about Parquet?
“Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. It provides efficient data compression and encoding schemes with enhanced performance to handle complex data in bulk.”
from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path
rng = default_rng(0)
tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
100 * rng.random(size=len(tickers))
* rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)
industries = Series(
data=rng.choice(
['energy', 'healthcare', 'tech', 'finance'],
size=len(tickers),
),
index=Index(tickers, name='ticker'),
name='industry',
).astype('category')
df = (
DataFrame({
'date': tile(dates, len(tickers)),
'ticker': repeat(tickers, len(dates)),
'price': prices,
'volumes': volumes,
})
.set_index(['date', 'ticker'])
.pipe(
lambda df: df
.join(
df['price']
.groupby('ticker').transform(lambda g: g > g.shift()).rename('signal')
)
)
)
with TemporaryDirectory() as d:
filename = Path(d) / 'data.parquet'
df.to_parquet(filename)
with open(filename, 'rb') as f:
print(f.read(100))
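    # Column projection and row-group filtering (a sketch; the filter value is
    # illustrative, and we write a flat copy so the selected column names are unambiguous):
    from pandas import read_parquet
    flat = Path(d) / 'flat.parquet'
    df.reset_index().to_parquet(flat)
    prices = read_parquet(flat, columns=['date', 'ticker', 'price'])
    big = read_parquet(flat, filters=[('price', '>', 100.0)])
    print(prices.shape, big.shape)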
from pandas import Series, date_range, read_parquet
from numpy.random import default_rng
from numpy import allclose
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path
from humanize import naturalsize
from _util import timed
rng = default_rng(0)
s = Series(
index=(idx := date_range('2000-01-01', freq='1T', periods=500_000)),
data=rng.random(size=len(idx)),
name='signal',
).rename_axis('date')
with TemporaryDirectory() as d:
filename = Path(d) / 'data.feather'
with timed('write_feather'):
write_feather(s.to_frame(), filename, compression='zstd')
before = s.to_frame()
with timed('read_feather'):
after = read_feather(filename)
assert (before.index == after.index).all() and (before.values == after.values).all()
print(f'{naturalsize(filename.stat().st_size) = }')
filename = Path(d) / 'data.parquet'
with timed('.to_parquet'):
# s.to_frame().to_parquet(filename)
s.to_frame().to_parquet(filename, compression='brotli')
before = s.to_frame()
with timed('read_parquet'):
after = read_parquet(filename)
assert (before.index == after.index).all() and (before.values == after.values).all()
print(f'{naturalsize(filename.stat().st_size) = }')
from dataclasses import dataclass
from pandas import DataFrame
from tempfile import TemporaryDirectory
from pathlib import Path
@dataclass
class T:
a : int = 0
b : int = 0
df = DataFrame({
'x': [T(), T()],
'y': [T(), T()],
})
with TemporaryDirectory() as d:
filename = Path(d) / 'data.parquet'
df.to_parquet(filename)
print("Let's take a look!")
from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path
rng = default_rng(0)
tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
100 * rng.random(size=len(tickers))
* rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)
industries = Series(
data=rng.choice(
['energy', 'healthcare', 'tech', 'finance'],
size=len(tickers),
),
index=Index(tickers, name='ticker'),
name='industry',
).astype('category')
df = (
DataFrame({
'date': tile(dates, len(tickers)),
'ticker': repeat(tickers, len(dates)),
'price': prices,
'volumes': volumes,
})
.set_index(['date', 'ticker'])
.pipe(
lambda df: df
.join(
df['price']
.groupby('ticker').transform(lambda g: g > g.shift()).rename('signal')
)
)
)
with TemporaryDirectory() as d:
filename = Path(d) / 'data.hdf'
df.to_hdf(filename, key='df')
with open(filename, 'rb') as f:
print(f.read(100))
from pandas import Series, date_range, read_parquet, read_hdf
from numpy.random import default_rng
from numpy import allclose
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path
from humanize import naturalsize
from _util import timed
rng = default_rng(0)
s = Series(
index=(idx := date_range('2000-01-01', freq='1T', periods=500_000)),
data=rng.random(size=len(idx)),
name='signal',
).rename_axis('date')
with TemporaryDirectory() as d:
filename = Path(d) / 'data.hdf'
with timed('.to_hdf'):
# s.to_hdf(filename, key='s')
s.to_hdf(filename, key='s', complib='blosc', complevel=9)
before = s
with timed('read_hdf'):
after = read_hdf(filename, key='s')
assert (before.index == after.index).all() and (before.values == after.values).all()
print(f'{naturalsize(filename.stat().st_size) = }')
filename = Path(d) / 'data.feather'
with timed('write_feather'):
write_feather(s.to_frame(), filename, compression='zstd')
before = s.to_frame()
with timed('read_feather'):
after = read_feather(filename)
assert (before.index == after.index).all() and (before.values == after.values).all()
print(f'{naturalsize(filename.stat().st_size) = }')
filename = Path(d) / 'data.parquet'
with timed('.to_parquet'):
s.to_frame().to_parquet(filename, compression='brotli')
before = s.to_frame()
with timed('read_parquet'):
after = read_parquet(filename)
assert (before.index == after.index).all() and (before.values == after.values).all()
print(f'{naturalsize(filename.stat().st_size) = }')
from dataclasses import dataclass
from pandas import DataFrame
from tempfile import TemporaryDirectory
from pathlib import Path
@dataclass
class T:
a : int = 0
b : int = 0
df = DataFrame({
'x': [T(), T()],
'y': [T(), T()],
})
with TemporaryDirectory() as d:
filename = Path(d) / 'data.hdf'
df.to_hdf(filename, key='df', complib='blosc', complevel=9)
from dataclasses import dataclass
from pandas import DataFrame, read_hdf
from tempfile import TemporaryDirectory
from pathlib import Path
from warnings import catch_warnings, filterwarnings
@dataclass
class T:
a : int = 0
b : int = 0
df = DataFrame({
'x': [T(), T()],
'y': [T(), T()],
})
with TemporaryDirectory() as d:
filename = Path(d) / 'data.hdf'
with catch_warnings():
filterwarnings('ignore')
df.to_hdf(filename, key='df', complib='blosc', complevel=9)
before = df
after = read_hdf(filename, key='df')
print(
before,
after,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import Series, DataFrame, date_range, MultiIndex, read_hdf
from numpy.random import default_rng
from tempfile import TemporaryDirectory
from pathlib import Path
from tables import open_file
rng = default_rng(0)
df = DataFrame(
index=(idx := MultiIndex.from_product([
['a', 'b', 'c'],
date_range('2020-01-01', periods=3),
], names=['entity', 'date'])),
data={
'signal': rng.normal(size=len(idx)),
'value': rng.normal(size=len(idx)),
},
)
s = Series(['x', 'y', 'z'], index=['a', 'b', 'c'])
print(
df,
s,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
with TemporaryDirectory() as d:
filename = Path(d) / 'data.hdf'
df.to_hdf(filename, key='df', complib='blosc', complevel=9)
s.to_hdf(filename, key='s', complib='blosc', complevel=9)
from numpy.random import default_rng
from tables import open_file
from tempfile import TemporaryDirectory
from pathlib import Path
rng = default_rng(0)
xs = rng.normal(size=(3, 3, 3))
with TemporaryDirectory() as d:
filename = Path(d) / 'data.hdf'
with open_file(filename, 'a') as f:
f.create_array(f.root, 'xs', xs)
HDF Pros: · comparable file sizes · filesystem-like hierarchy (many keyed datasets per file) · relatively general (can store n-dimensional data; falls back to pickle for arbitrary objects)
HDF Cons: · relatively slow
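The “filesystem-like” point deserves a sketch: a single HDF file can hold several keyed datasets, and table-format datasets can be queried on read without loading everything. The file and key names here are illustrative:
from pandas import DataFrame, HDFStore, read_hdf
frame = DataFrame({'x': range(10), 'y': range(10, 20)})
with HDFStore('store.h5', complib='blosc', complevel=9) as store:
    store.put('raw/frame', frame, format='table', data_columns=True)
    store.put('derived/means', frame.mean().to_frame('mean'))
    print(store.keys())  # both keys live in the one file
subset = read_hdf('store.h5', key='raw/frame', where='x > 5')
print(subset)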
from pandas import DataFrame, MultiIndex, date_range, read_parquet, read_hdf
from numpy.random import default_rng
from pyarrow.feather import write_feather, read_feather
from tempfile import TemporaryDirectory
from pathlib import Path
rng = default_rng(0)
df = DataFrame(
index=(idx := MultiIndex.from_product([
['a', 'b', 'c'],
date_range('2020-01-01', periods=3),
], names=['entity', 'date'])),
data={
'x': rng.random(size=len(idx)),
'y': rng.random(size=len(idx)),
}
)
df.attrs = {'metadata': 'some metadata'}
df = df.groupby('date').sum()
with TemporaryDirectory() as d:
filename = Path(d) / 'data.fea'
write_feather(df, filename)
after = read_feather(filename)
print(f'Feather: {after.attrs = }')
    filename = Path(d) / 'data.parquet'
df.to_parquet(filename)
after = read_parquet(filename)
print(f'Parquet: {after.attrs = }')
filename = Path(d) / 'data.hdf'
df.to_hdf(filename, key='df')
after = read_hdf(filename, key='df')
print(f'HDF: {after.attrs = }')
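    # Metadata that does survive (a sketch): Arrow lets us attach key/value pairs to
    # the Parquet schema itself, so the annotation comes back on read; 'meta.parquet'
    # and the b'note' key are illustrative names.
    import pyarrow as pa
    import pyarrow.parquet as pq
    table = pa.Table.from_pandas(df)
    table = table.replace_schema_metadata(
        {**(table.schema.metadata or {}), b'note': b'some metadata'}
    )
    pq.write_table(table, str(Path(d) / 'meta.parquet'))
    print(pq.read_table(str(Path(d) / 'meta.parquet')).schema.metadata[b'note'])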
from numpy.random import default_rng
from h5py import File
from tempfile import TemporaryDirectory
from pathlib import Path
rng = default_rng(0)
with TemporaryDirectory() as d:
filename = Path(d) / 'data.hdf'
with File(filename, 'a') as f:
f['data'] = ...
print("Let's take a look!")
from numpy.random import default_rng
rng = default_rng(0)
xs = rng.normal(size=(3, 3, 3))
ys = rng.normal(size=(2, 1, 1, 3))
print(
xs,
ys,
# xs * ys,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy.random import default_rng
from xarray import DataArray
rng = default_rng(0)
xs = DataArray(
data=rng.normal(size=(3, 3, 3)),
dims=[*'abc'],
)
ys = DataArray(
data=rng.normal(size=(2, 1, 1, 3)),
dims=[*'defc'],
)
print(
xs,
ys,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy.random import default_rng
from xarray import DataArray, Dataset
rng = default_rng(0)
xs = DataArray(
data=rng.normal(size=(3, 3, 3)),
dims=[*'abc'],
)
ys = DataArray(
data=rng.normal(size=(2, 1, 1, 3)),
dims=[*'defc'],
)
ds = Dataset({'xs': xs, 'ys': ys})
print(
ds,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy.random import default_rng
from xarray import DataArray, Dataset
from tempfile import TemporaryDirectory
from pathlib import Path
rng = default_rng(0)
xs = DataArray(
data=rng.normal(size=(3, 3, 3)),
dims=[*'abc'],
)
ys = DataArray(
data=rng.normal(size=(2, 1, 1, 3)),
dims=[*'defc'],
)
ds = Dataset({'xs': xs, 'ys': ys})
with TemporaryDirectory() as d:
filename = Path(d) / 'data.cdf'
ds.to_netcdf(filename)
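    # Reading the Dataset back (a sketch): open_dataset is the documented counterpart
    # to to_netcdf and needs a netCDF backend (e.g., netcdf4 or scipy) installed.
    from xarray import open_dataset
    with open_dataset(filename) as roundtrip:
        print(roundtrip)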
from zarr import open as zarr_open, group
from numcodecs import Blosc
from tempfile import TemporaryDirectory
from pathlib import Path
from numpy.random import default_rng
rng = default_rng(0)
with TemporaryDirectory() as d:
filename = Path(d) / 'data.zarr'
z = zarr_open(
filename,
mode='w',
shape=(sh := (10_000, 10_000)),
chunks=(1_000, 1_000),
dtype=(dt := 'uint8'),
compressor=Blosc(
clevel=9,
),
)
z[:] = rng.integers(0, 255, size=sh, dtype=dt)
print(z.info)
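    # Reading it back lazily (a sketch): only the chunks touched by the slice are
    # decompressed and loaded into memory.
    z2 = zarr_open(filename, mode='r')
    print(z2[:2, :2])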
from pandas import DataFrame, Series, Index, date_range
from numpy.random import default_rng
from string import ascii_lowercase
from numpy import repeat, tile
from tempfile import TemporaryDirectory
from pathlib import Path
from sqlite3 import connect
rng = default_rng(0)
tickers = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
dates = date_range('2020-01-01', periods=15)
prices = (
100 * rng.random(size=len(tickers))
* rng.normal(loc=1, scale=0.01, size=(len(dates), len(tickers))).cumprod(axis=0)
).ravel().round(2)
volumes = rng.integers(-50_000, +50_000, size=len(dates) * len(tickers)).round(-2)
industries = Series(
data=rng.choice(
['energy', 'healthcare', 'tech', 'finance'],
size=len(tickers),
),
index=Index(tickers, name='ticker'),
name='industry',
).astype('category')
df = DataFrame({
'date': tile(dates, len(tickers)),
'ticker': repeat(tickers, len(dates)),
'price': prices,
'volumes': volumes,
}).set_index(['date', 'ticker']).join(industries)
df['signal'] = df.groupby('ticker')['price'].transform(lambda g: g > g.shift())
with TemporaryDirectory() as d:
filename = Path(d) / 'data.sql'
with connect(filename) as db:
df.to_sql('df', db)
with connect(filename) as db:
print([*db.execute('select * from df')])
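    # Round-tripping through pandas instead of raw cursors (a sketch): read_sql can
    # rebuild the MultiIndex and parse the dates that sqlite stored as text.
    from pandas import read_sql
    with connect(filename) as db:
        after = read_sql(
            'select * from df', db,
            index_col=['date', 'ticker'],
            parse_dates=['date'],
        )
    print(after.dtypes)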