pandas is well-designed, actually
Theme: applied pandas
Topic: how the pieces of pandas “fit” together
| Presenter | James Powell james@dutc.io |
| Date | Friday, November 6, 2020 |
| Time | 9:30 AM PST |
# greet the audience (three times, as in the live session)
for _ in range(3):
    print('Welcome!')
Python systems have two programmatic domains: · programme structuring · computational work
In the programme structuring domain, we make use of tools like the Python built-in types (e.g., list, dict, set, &c.) as well as Python metaphors (e.g., context managers for sequenced operations, decorators for wrapped behaviours, generators for lazy computation or for explicit lexical linearisation of implicit state-graphs, &c.)
Take a look at the Python list…
# a `list` is a structural container: it happily holds
# heterogeneous, boxed Python objects (including None)
xs = [1, 2, 3, 4, None, 'abcd']
ys = [1, 2, 3, 4]
for name, val in (('xs', xs), ('ys', ys)):
    print(f'{name} = {val!r}')
# a `list` is a structural, not a computational, type:
# arithmetic operators manipulate the structure itself
xs = [1, 2, 3, 4]
ys = [5, 6, 7, 8]
print(f'{xs * 2 = }') # "structure"-level operation
# → repetition (not elementwise scaling)
print(f'{xs + ys = }') # "structure"-level operation
# → concatenation (not elementwise addition)
# elementwise computation needs an explicit mapping
zs = list(map(lambda x: x ** 2, xs))
print(f'{zs = }')
# the contents of a Python built-in type are references
# to boxed objects; Python's `int` is arbitrary-precision
# (no fixed bit-width), so large values promote automatically
# instead of overflowing
xs = [pow(x, 20_000) for x in range(4)]
print(f'{xs = }')
# consequently, operations over these boxed, reference-based
# structures are hard to optimise — thus, slow
from utils import timed
from random import randint
def dot(us, vs):
    # pairwise multiply-and-accumulate in pure Python
    return sum(u * v for u, v in zip(us, vs))
with timed('creating `list` with comprehension syntax'):
    xs = [randint(-100, 100) for _ in range(10_000_000)]
    ys = [randint(-100, 100) for _ in range(10_000_000)]
with timed('`list`: dot product of xs and ys'):
    dot(xs, ys)
# in comparison, `numpy` provides a computational type:
# a "manager class" that fully controls its contents, and so is
# free to store them in an optimal fashion and to enforce
# constraints on them
# → a "restricted computational domain"
from utils import timed
from numpy.random import randint
with timed('creating `numpy.ndarray`'):
    xs = randint(-100, 100, size=10_000_000)
    ys = randint(-100, 100, size=10_000_000)
def dot(us, vs):
    # same pure-Python dot product as before, defined for comparison
    # (the timed call below uses the vectorised `ndarray.dot` instead)
    return sum(u * v for u, v in zip(us, vs))
with timed('`numpy.ndarray`: dot product of xs and ys'):
    xs.dot(ys)
# additionally, the `numpy.ndarray` is visibly a computational
# type: machine-typed contents, elementwise operators
from numpy import array
xs = array([1, 2, 3, 4])
ys = array([5, 6, 7, 8])
print(f'{xs = }')
print(f'{xs.dtype = }') # NOTE: the contents are machine-typed
# (i.e., fixed bit-width integers)
print(f'{xs * 2 = }') # NOTE: elementwise operation
print(f'{xs + ys = }') # NOTE: elementwise operation
# an ndarray can hold arbitrary Python objects, but then it
# degrades to dtype=object (boxed references again)
from numpy import array
xs = array([object() for _ in range(2)])
print(f'{xs = }')
# the `numpy.ndarray` array is actually
# just a view of some memory region,
# plus metadata describing how to interpret that region
from numpy import array
xs = array([1, 2, 3, 4])
print(f'{xs = }')
print(f'{xs.__array_interface__["data"][0] = :#_x}') # memory location (base address, shown in hex)
print(f'{xs.dtype = }') # interpreted type (how each element's bytes are read)
print(f'{xs.shape = }') # interpreted shape (how the flat region is arranged)
print(f'{xs.strides = }') # interpreted strides (bytes to step per axis)
# aside: parsing text into a computational value
s = '123' # a structural sequence of characters
i = int(s) # parsed into an (arbitrary-precision) integer
from numpy import array
xs = array([1, 2, 3, 4])
# reinterpret a copy's metadata: the same four values,
# now shaped as a 1×4 matrix (only the shape changes)
ys = xs.copy()
ys.shape = 1, 4
print(f'{xs = }')
print(f'{ys = }')
from numpy import array
xs = array([1, 2, 3, 4])
# the same four values, reinterpreted as a 2×2 matrix —
# only the copy's shape metadata changes, not its data
ys = xs.copy()
ys.shape = 2, 2
print(f'{xs = }')
print(f'{ys = }')
from numpy import array
xs = array([1, 2, 3, 4])
from numpy.lib.stride_tricks import as_strided
# build a 2×2 strided view over the same buffer; derive the byte
# strides from the source array rather than hard-coding (16, 8),
# which silently assumed 8-byte integer items and would produce
# garbage on platforms where the default int is 4 bytes
ys = as_strided(xs, strides=(2 * xs.strides[0], xs.strides[0]), shape=(2, 2))
print(f'{xs = }')
print(f'{ys = }')
from numpy import set_printoptions
# keep array reprs on one line, abbreviating beyond 24 elements
set_printoptions(linewidth=float('inf'), threshold=24)
from numpy import array
xs = array([1, 2, 3, 4]) * 100
# cast: reinterpret the very same bytes under a new dtype
# (four 8-byte ints become thirty-two 1-byte ints)
ys = xs.copy()
ys.dtype = 'int8'
print('.dtype = …'.center(20, '-'))
print(f'{xs = }')
print(f'{ys = }')
# NOTE: distinct from `.astype`, which *converts* value-by-value
# (cast vs convert)
print('', '.astype(…)'.center(20, '-'), sep='\n')
ys = xs.copy().astype('int8')
print(f'{xs = }')
print(f'{ys = }')
# thus, analysing memory usage of `numpy` code
# is very easy
from numpy import shares_memory
from numpy.random import normal
# 100_000 float64 samples → unboxed, contiguous storage
xs = normal(size=100_000)
print(f'{xs.nbytes = :,}') # total number of bytes
# converting to float32 halves the storage
xs = normal(size=100_000).astype('float32')
print(f'{xs.nbytes = :,}') # total number of bytes
# aliasing: both names refer to the same buffer
ys = xs
print(f'{shares_memory(xs, ys) = }')
# copying: a distinct buffer
ys = xs.copy()
print(f'{shares_memory(xs, ys) = }')
However, numpy as a pure computational type has limits:
# numpy has no native missing-data support (cf. `xarray.DataArray`
# for labelled extensions of numpy — this reference had been fused
# onto the import line below, breaking it)
from numpy import array
xs = array([1, 2, 3, None]) # None forces boxed dtype=object
print(f'{xs = }')
print(f'{xs.dtype = }')
xs = array([1_000_000, 2, 3, float('nan')]) # NaN forces promotion to float64
print(f'{xs = }')
print(f'{xs.dtype = }')
from utils import printf
from pandas import array
# `pandas.array` supports missing data natively: the nullable
# integer extension array keeps values and a separate boolean mask
xs = array([1, 2, 3, None])
printf('xs')
print(f'{xs.dtype = }') # nullable extension dtype, not object/float
print(f'{xs._data = }') # underlying values (private API)
print(f'{xs._mask = }') # presumably True marks missing entries — private API, verify
from utils import printf
from pandas import array, Categorical
from numpy import shares_memory
from numpy.random import choice
from string import ascii_lowercase
# build 10 random 4-letter "words": a (10, 4) array of single
# characters, viewed as length-4 strings, then flattened
ws = choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
print(f'{ws.nbytes = }')
xs = array(ws)
printf('xs')
# a much larger sample of random 4-letter words
ys = array(choice([*ascii_lowercase], size=(100_000, 4)).view('<U4').ravel())
print(f'{xs.memory_usage(deep=True) = :,}')
print(f'{ys.memory_usage(deep=True) = :,}')
print(f'{xs._ndarray.nbytes = :,}') # backing buffer only (private API)
print(f'{ys._ndarray.nbytes = :,}')
print(f'{xs._ndarray.dtype = :}')
print(f'{ys._ndarray.dtype = :}')
# NOTE(review): these check whether `pandas.array` copied `ws` — confirm
print(f'{shares_memory(xs._ndarray, ws) = }')
print(f'{xs._ndarray is ws = }')
# a Categorical stores each distinct value once, plus small integer codes
zs = Categorical(ys)
print(f'{ys.memory_usage(deep=True) = :,}')
print(f'{zs.memory_usage(deep=True) = :,}')
from pandas import Series
from numpy.random import normal
# wrap an existing ndarray in a Series; the print below reports
# whether the Series re-used the very same backing buffer
xs = normal(size=100_000)
s = Series(xs, name='nums')
print(f'{s.array._ndarray is xs = }')
from utils import printf
from pandas import Series
from numpy.random import normal
# a Series supports the same vectorised operations as the
# ndarray it wraps, plus Series-only extras
s = Series(xs := normal(size=100_000), name='nums')
printf('s')
# boolean-mask selection works on both Series and ndarray
printf('s[s > 2]')
print(f'{len(s[s > 2]) / len(s) = }') # fraction of samples above 2
printf('s[(s >= 2) | (s <= -2)]')
printf('xs[(xs >= 2) | (xs <= -2)]')
# callable indexing is Series-only
printf('s[lambda s: s >= 2]')
# printf('xs[lambda xs: s >= 2]') # NotImplemented
# reductions exist on both…
print(f'{s.sum() = }')
print(f'{s.var() = }') # NOTE(review): pandas defaults to ddof=1, numpy to ddof=0 — values differ slightly; confirm
print(f'{s.std() = }')
print(f'{xs.sum() = }')
print(f'{xs.var() = }')
print(f'{xs.std() = }')
# …but shift/diff are Series-only
print(f'{s.shift() = }')
print(f'{s.diff() = }')
# print(f'{xs.shift() = }') # NotImplemented
# print(f'{xs.diff() = }') # NotImplemented
Alternative lookup modalities.
pandas.Series provides a structure with two lookup modes:
from utils import printf
from pandas import Series
from numpy import arange
from string import ascii_lowercase
# a Series offers two lookup modes over the same data:
# positional (`.iloc`) and labelled (`.loc`)
s = Series(arange(size := 6), index=[*ascii_lowercase[:size]], name='nums')
printf('s')
print(f'{s.iloc[0] = }') # positional lookup
print(f'{s.loc["a"] = }') # labelled lookup
printf('s.iloc[0:2]') # positional slice: half-open, [start, stop)
printf('s.loc["a":"c"]') # NOTE: does not use half-open interval notation!
# uses a closed interval; i.e., [start, stop] — both endpoints included
from utils import printf
from pandas import Series, MultiIndex
from numpy import arange
from string import ascii_lowercase
from itertools import islice, cycle
# a MultiIndex supports hierarchical (multi-level) labelled lookup
s = Series(arange(size := 6), name='nums')
printf('s')
index0 = 'ab' # outer-level labels
assert size % len(index0) == 0
# inner-level labels: size // len(index0) consecutive letters,
# starting at offset `off` (cycling around the alphabet)
index1 = lambda off: islice(cycle(ascii_lowercase), off, off + (size // len(index0)))
s.index = MultiIndex.from_tuples((x, y) for x in index0 for y in index1(ord(x)))
printf('s')
printf('s.loc["a"]') # partial lookup: everything under outer label "a"
print(f'{s.loc["a", "t"] = }') # full lookup: one (outer, inner) pair
from utils import printf
from pandas import Series, to_datetime
from numpy import arange, array
from string import ascii_lowercase
from datetime import datetime, timedelta
from random import randrange
# a DatetimeIndex enables timestamp-aware lookups
s = Series(arange(size := 6), name='nums')
# random, strictly increasing timestamps starting at 9:00 AM today
s.index = to_datetime(array([timedelta(seconds=randrange(1, 60)) for _ in range(size)]).cumsum() + datetime.now().replace(hour=9, minute=0, second=0, microsecond=0))
# pin the first timestamp so the partial-string lookups below match
s.index = to_datetime(['2020-11-06 09:00', *s.index[1:]])
printf('s')
# partial-string indexing: select by day…
# printf('s.loc["2020-11-06"]')
#
# …by month…
# printf('s.loc["2020-11"]')
#
# …or look up one exact timestamp
# printf('s.loc["2020-11-06 09:00:04"]')
from utils import printf
from pandas import Series, to_datetime
from numpy import arange, array
from string import ascii_lowercase
from datetime import datetime, timedelta
from random import randrange
from numpy.random import permutation
# a sorted ("monotonic") index is what makes fast index lookups possible
s = Series(arange(size := 6), name='nums')
# random, strictly increasing timestamps starting at 9:00 AM today
s.index = to_datetime(array([timedelta(seconds=randrange(1, 60)) for _ in range(size)]).cumsum() + datetime.now().replace(hour=9, minute=0, second=0, microsecond=0))
printf('s')
# `Index.is_monotonic` was deprecated in pandas 1.5 and removed in
# pandas 2.0 — use the explicit `is_monotonic_increasing` instead
print(f'{s.index.is_monotonic_increasing = }')
s.index = permutation(s.index) # shuffle: the index is no longer sorted
printf('s')
print(f'{s.index.is_monotonic_increasing = }')
A pandas.Series is a one-dimensional structure with an index, looking like…
# index data
# . x
# . x
# . x
# . x
A pandas.DataFrame is a two-dimensional structure with a major and minor index, looking like…
# "column" index
# a b
# "row"
# index data data
# . x x
# . x x
# . x x
# . x x
from utils import printf
from pandas import DataFrame, to_datetime
from numpy import arange, array
from string import ascii_lowercase
from datetime import datetime, timedelta
from random import randrange
from numpy.random import permutation
# a DataFrame: three columns ('a', 'b', 'c') sharing one "row" index
df = DataFrame({
'a': arange(size := 6),
'b': arange(10, 10+size),
'c': arange(100, 100+size),
})
# random, increasing timestamps starting at 9:00 AM today as the "row" index
df.index = to_datetime(array([timedelta(seconds=randrange(1, 60)) for _ in range(size)]).cumsum() + datetime.now().replace(hour=9, minute=0, second=0, microsecond=0))
printf('df')
# transposing swaps the rôles of the "row" and "column" indices
# df = df.transpose()
# printf('df')
A quick guide to pandas.DataFrame operations:
# "column" index
# a b
# "row"
# index data data
# . x x
# . x x
# . x x
# . x x
# df.x
# df[x] → look-up, using "column" index, labelled
# df[df.columns[x]] → look-up, using "column" index, positional
# df.iloc[x] → look-up, using "row" index, positional
# df.loc[x] → look-up, using "row" index, labelled
# df.groupby(…, axis=0) → aggregate data along "rows" using "row" index
# df.groupby(…, axis=1) → aggregate data along "columns" using "column" index
# df.resample(…) → perform N:M mapping of data along "rows" using "row" index
# df.stack → turn "column" index into a "row" index
# df.unstack → turn "row" index into a "column" index
# df.melt → take data along "rows" with corresponding "column" index values
# and turn into new columns
# df.pivot → pivot data along "rows" into new columns
# df.pivot_table → perform .groupby and .unstack with finer control
# scratch examples (Q&A): masking and callable indexing on
# whichever `s` is in scope at this point in the session
s[(s > 2) | (s < 3)] # NOTE(review): this OR-mask is always True — perhaps `&` was intended; confirm
s[s > 2]
s[lambda s: ...] # callable indexing with a placeholder body