ts-python

Python @ Two Sigma (seminar)

pandas is well-designed, actually

Theme: applied pandas

Topic: how the pieces of pandas “fit” together

Presenter James Powell james@dutc.io
Date Friday, November 6, 2020
Time 9:30 AM PST
# Seminar warm-up: greet the audience three times.
for _ in range(3):
    print('Welcome!')

Python systems have two programmatic domains: · programme structuring · computational

In the programme structuring domain, we make use of tools like the Python built-in types (e.g., list, dict, set, &c.) as well as Python metaphors (e.g., context managers for sequenced operations, decorators for wrapped behaviours, generators for lazy computation or for explicit lexical linearisation of implicit state-graphs, &c.)

Take a look at the Python list

# A Python `list` is a "programme structuring" type, not a computational
# one: it stores references to arbitrary, boxed, even mixed-type objects.
xs = [1, 2, 3, 4, None, 'abcd']
ys = [1, 2, 3, 4]

print(f'{xs = }')
print(f'{ys = }')
# this is not a computational type,
#   otherwise, these operations
#   would be interpreted computationally

xs = [1, 2, 3, 4]
print(f'{xs * 2  = }') # "structure"-level operation
                       # → repetition

ys = [5, 6, 7, 8]
print(f'{xs + ys = }') # "structure"-level operation
                       # → concatenation

# elementwise arithmetic requires an explicit comprehension/loop
zs = [x ** 2 for x in xs]
print(f'{zs = }')
# the contents of a Python built-in type are
#   references to boxed types

xs = [x ** 20_000 for x in range(4)] # NOTE: automatic promotion
                                     #       `int` is NOT machine-typed
                                     #             does NOT have fixed bit-width
print(f'{xs = }')
# as a consequence, operations
#   within these structures
#   are hard to optimise; thus, slow

from utils import timed
from random import randint

# PEP 8 (E731): bind a `def`, not a lambda, to a name — we get a proper
# __name__ in tracebacks and a place for a docstring.
def dot(xs, ys):
    """Python-level dot product of two sequences.

    Pairs elements with `zip` (stops at the shorter input) and sums the
    products — every step runs through the interpreter on boxed ints.
    """
    return sum(x * y for x, y in zip(xs, ys))

# time building two ten-million-element lists of boxed ints…
with timed('creating `list` with comprehension syntax'):
    xs = [randint(-100, 100) for _ in range(10_000_000)]
    ys = [randint(-100, 100) for _ in range(10_000_000)]

# …and the interpreter-bound dot product over them (result discarded;
# only the elapsed time matters here)
with timed('`list`: dot product of xs and ys'):
    dot(xs, ys)
# in comparison, `numpy` provides us with a 
#   computational type
# this type is a "manager class" that fully controls
#   its contents
# thus, it is free to store its contents in an optimal fashion
# thus, it is able to enforce constraints on its contents

# → "restricted computational domain"

from utils import timed
from numpy.random import randint

# Same experiment as the `list` version above, but with `numpy.ndarray`:
# a "manager class" that owns its machine-typed, unboxed storage.
with timed('creating `numpy.ndarray`'):
    xs = randint(-100, 100, size=10_000_000)
    ys = randint(-100, 100, size=10_000_000)

# NOTE(review): this Python-level `dot` is defined but never called here —
#   presumably kept for side-by-side comparison with the native method below
dot = lambda xs, ys: sum(x * y for x, y in zip(xs, ys))

# the native (C-level) dot product is what actually gets timed
with timed('`numpy.ndarray`: dot product of xs and ys'):
    xs.dot(ys)
# additionally, the `numpy.ndarray` is clearly
#   a computational type
from numpy import array

# `numpy.ndarray` is a computational type: arithmetic operators are
# interpreted elementwise rather than structurally.
xs = array([1, 2, 3, 4])
print(f'{xs       = }')
print(f'{xs.dtype = }') # NOTE: the contents are machine-typed
                        #       (i.e., fixed bit-width int64)
print(f'{xs * 2   = }') # NOTE: elementwise operation
                        #       (contrast: `list * 2` repeats the structure)

ys = array([5, 6, 7, 8])
print(f'{xs + ys  = }') # NOTE: elementwise operation
from numpy import array
# arbitrary Python objects force dtype=object: an array of pointers to
# boxed values, forfeiting the machine-typed storage benefits
xs = array([object(), object()])
print(f'{xs = }')
# the `numpy.ndarray` array is actually
#   just a view of some memory region
from numpy import array

# an ndarray = a raw memory buffer + metadata describing how to
# interpret it (dtype, shape, strides)
xs = array([1, 2, 3, 4])
print(f'{xs                                = }')
print(f'{xs.__array_interface__["data"][0] = :#_x}') # memory location
print(f'{xs.dtype                          = }') # interpreted type
print(f'{xs.shape                          = }') # interpreted shape
print(f'{xs.strides                        = }') # interpreted strides
# NOTE(review): stray aside — presumably illustrating value *conversion*
#   (building a new object from the old) as opposed to reinterpreting
#   memory in place; confirm intent against the cast-vs-convert example
s = '123'
i = int(s)
from numpy import array
xs = array([1, 2, 3, 4])

# assigning to `.shape` rewrites only the interpretation metadata —
# the underlying buffer is untouched (4 elements re-viewed as 1×4);
# `.copy()` keeps `xs` itself unaffected
(ys := xs.copy()).shape = 1, 4
print(f'{xs = }')
print(f'{ys = }')
from numpy import array
xs = array([1, 2, 3, 4])

# same buffer, different metadata: the 4 elements re-viewed as a
# 2×2 matrix — no data is moved or copied
(ys := xs.copy()).shape = 2, 2
print(f'{xs = }')
print(f'{ys = }')
from numpy import array
xs = array([1, 2, 3, 4])

from numpy.lib.stride_tricks import as_strided
# Build a 2×2 view over the same buffer by supplying strides by hand.
# Derive the strides from `xs.itemsize` rather than hard-coding (16, 8),
# which silently assumed 8-byte elements — the default integer dtype is
# platform-dependent (e.g. 4-byte on Windows builds), and a wrong stride
# makes `as_strided` reinterpret out-of-bounds garbage.
ys = as_strided(xs, strides=(2 * xs.itemsize, xs.itemsize), shape=(2, 2))
print(f'{xs = }')
print(f'{ys = }')
from numpy import set_printoptions
set_printoptions(linewidth=float('inf'), threshold=24)

from numpy import array
xs = array([1, 2, 3, 4]) * 100

# CAST: reassigning `.dtype` reinterprets the same bytes in place —
# four int64 values become thirty-two int8 values
(ys := xs.copy()).dtype = 'int8'
print('.dtype = …'.center(20, '-'))
print(f'{xs = }')
print(f'{ys = }')

# NOTE: distinct from `.astype`!
#       cast vs convert
# CONVERT: `.astype` builds a NEW buffer, converting value-by-value
# (values outside the int8 range wrap around)
print('', '.astype(…)'.center(20, '-'), sep='\n')
ys = xs.copy().astype('int8')
print(f'{xs = }')
print(f'{ys = }')
# thus, analysing memory usage of `numpy` code
#   is very easy
from numpy import shares_memory
from numpy.random import normal

xs = normal(size=100_000)
print(f'{xs.nbytes = :,}') # total number of bytes (float64 → 8 per element)

xs = normal(size=100_000).astype('float32')
print(f'{xs.nbytes = :,}') # total number of bytes (float32 → half the size)

# plain assignment only binds another name to the SAME buffer…
ys = xs
print(f'{shares_memory(xs, ys) = }')

# …whereas `.copy()` allocates a new one
ys = xs.copy()
print(f'{shares_memory(xs, ys) = }')

However, numpy as a pure computational type has limits:

from numpy import array

# a `None` element forces dtype=object — numpy has no native
# missing-value support for integer data
xs = array([1, 2, 3, None])
print(f'{xs       = }')
print(f'{xs.dtype = }')

# using NaN as the missing marker instead forces float64:
# the integer data silently becomes floating-point
xs = array([1_000_000, 2, 3, float('nan')])
print(f'{xs       = }')
print(f'{xs.dtype = }')
from utils import printf
from pandas import array

# pandas' nullable integer array keeps the data machine-typed and
# tracks missingness in a separate boolean mask
xs = array([1, 2, 3, None])
printf('xs')
print(f'{xs.dtype = }')
# NOTE(review): `_data`/`_mask` are private internals — their names and
#   layout may differ across pandas versions; verify before relying on them
print(f'{xs._data = }')
print(f'{xs._mask = }')
from utils import printf
from pandas import array, Categorical
from numpy import shares_memory
from numpy.random import choice
from string import ascii_lowercase

# build random 4-letter "words": a (10, 4) array of single characters,
# reinterpreted (`.view('<U4')`) as one 4-character string per row,
# then flattened with `.ravel()`
ws = choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
print(f'{ws.nbytes = }')

xs = array(ws)
printf('xs')

ys = array(choice([*ascii_lowercase], size=(100_000, 4)).view('<U4').ravel())
# `deep=True` also counts the referenced Python objects,
# not just the backing pointer array
print(f'{xs.memory_usage(deep=True) = :,}')
print(f'{ys.memory_usage(deep=True) = :,}')

# NOTE(review): `_ndarray` is a private attribute — verify against the
#   installed pandas version
print(f'{xs._ndarray.nbytes = :,}')
print(f'{ys._ndarray.nbytes = :,}')

print(f'{xs._ndarray.dtype = :}')
print(f'{ys._ndarray.dtype = :}')

# do the pandas array and the source ndarray share storage?
# these two checks probe exactly that
print(f'{shares_memory(xs._ndarray, ws) = }')
print(f'{xs._ndarray is ws              = }')

# `Categorical` dictionary-encodes: integer codes plus one copy of each
# unique category — compare the two deep memory figures below
zs = Categorical(ys)
print(f'{ys.memory_usage(deep=True) = :,}')
print(f'{zs.memory_usage(deep=True) = :,}')
from pandas import Series
from numpy.random import normal

# does `Series` wrap the passed ndarray without copying?
# NOTE(review): historically True; pandas' copy-on-write changes may
#   affect this — verify on the installed version
s = Series(xs := normal(size=100_000), name='nums')
print(f'{s.array._ndarray is xs = }')
from utils import printf
from pandas import Series
from numpy.random import normal

s = Series(xs := normal(size=100_000), name='nums')
printf('s')

# boolean-mask selection works like numpy's…
printf('s[s > 2]')
print(f'{len(s[s > 2]) / len(s) = }')

printf('s[(s >= 2) | (s <= -2)]')
printf('xs[(xs >= 2) | (xs <= -2)]')

# …but callable-based selection is pandas-only
printf('s[lambda s: s >= 2]')
#  printf('xs[lambda xs: s >= 2]') # NotImplemented

# reductions exist on both
# NOTE(review): `.var()`/`.std()` defaults differ — pandas uses ddof=1
#   (sample), numpy uses ddof=0 (population) — so these values won't match
print(f'{s.sum() = }')
print(f'{s.var() = }')
print(f'{s.std() = }')
print(f'{xs.sum() = }')
print(f'{xs.var() = }')
print(f'{xs.std() = }')

# index-aware operations are pandas-only
print(f'{s.shift() = }')
print(f'{s.diff() = }')
#  print(f'{xs.shift() = }') # NotImplemented
#  print(f'{xs.diff() = }') # NotImplemented

Alternative lookup modalities.

pandas.Series provides a structure with two lookup modes:

from utils import printf
from pandas import Series
from numpy import arange
from string import ascii_lowercase

# the two lookup modes: positional (`.iloc`) and labelled (`.loc`)
s = Series(arange(size := 6), index=[*ascii_lowercase[:size]], name='nums')
printf('s')

print(f'{s.iloc[0]  = }')
print(f'{s.loc["a"] = }')

printf('s.iloc[0:2]')
printf('s.loc["a":"c"]') # NOTE: does not use half-open interval notation!
                         #       uses a CLOSED interval; i.e., [start, stop] —
                         #       both endpoints are included
from utils import printf
from pandas import Series, MultiIndex
from numpy import arange
from string import ascii_lowercase
from itertools import islice, cycle

s = Series(arange(size := 6), name='nums')
printf('s')

index0 = 'ab'
assert size % len(index0) == 0
# three letters from the (cycled) alphabet starting at offset `off`;
# e.g. off = ord('a') = 97 ≡ 19 (mod 26) → starts at 't'
index1 = lambda off: islice(cycle(ascii_lowercase), off, off + (size // len(index0)))

# hierarchical (outer, inner) labels for the same six rows
s.index = MultiIndex.from_tuples((x, y) for x in index0 for y in index1(ord(x)))
printf('s')

# partial lookup on the outer level returns a sub-Series…
printf('s.loc["a"]')
# …a full (outer, inner) lookup returns a scalar
print(f'{s.loc["a", "t"] = }')
from utils import printf
from pandas import Series, to_datetime
from numpy import arange, array
from string import ascii_lowercase
from datetime import datetime, timedelta
from random import randrange

s = Series(arange(size := 6), name='nums')
# index: cumulative random gaps (1–59 s) starting from 09:00:00 today
s.index = to_datetime(array([timedelta(seconds=randrange(1, 60)) for _ in range(size)]).cumsum() + datetime.now().replace(hour=9, minute=0, second=0, microsecond=0))
# pin the first timestamp to a known literal so the lookups below
# have a predictable target
s.index = to_datetime(['2020-11-06 09:00', *s.index[1:]])
printf('s')

# datetime indexes support partial-string lookups at varying granularity:
#  printf('s.loc["2020-11-06"]')
#
#  printf('s.loc["2020-11"]')
#
#  printf('s.loc["2020-11-06 09:00:04"]')
from utils import printf
from pandas import Series, to_datetime
from numpy import arange, array
from string import ascii_lowercase
from datetime import datetime, timedelta
from random import randrange
from numpy.random import permutation

# index: cumulative random timestamps — sorted (monotonic) by construction
s = Series(arange(size := 6), name='nums')
s.index = to_datetime(array([timedelta(seconds=randrange(1, 60)) for _ in range(size)]).cumsum() + datetime.now().replace(hour=9, minute=0, second=0, microsecond=0))
printf('s')

# NOTE: `Index.is_monotonic` was deprecated in pandas 1.5 and removed in
# 2.0; `is_monotonic_increasing` is the supported spelling with the same
# meaning.
print(f'{s.index.is_monotonic_increasing = }')

# shuffling the index destroys the monotonicity (and the fast
# sorted-lookup code paths that rely on it)
s.index = permutation(s.index)
printf('s')
print(f'{s.index.is_monotonic_increasing = }')

A pandas.Series is a one-dimensional structure with an index, looking like…

# index    data
#   .       x
#   .       x
#   .       x
#   .       x

A pandas.DataFrame is a two-dimensional structure with a major and minor index, looking like…

#       "column"  index
#           a       b
# "row"
# index    data   data
#   .       x       x
#   .       x       x
#   .       x       x
#   .       x       x
from utils import printf
from pandas import DataFrame, to_datetime
from numpy import arange, array
from string import ascii_lowercase
from datetime import datetime, timedelta
from random import randrange
from numpy.random import permutation

# a DataFrame pairs a "row" index with a "column" index over 2-D data
df = DataFrame({
    'a': arange(size := 6),
    'b': arange(10, 10+size),
    'c': arange(100, 100+size),
})
# row index: cumulative random timestamps starting at 09:00:00 today
df.index = to_datetime(array([timedelta(seconds=randrange(1, 60)) for _ in range(size)]).cumsum() + datetime.now().replace(hour=9, minute=0, second=0, microsecond=0))
printf('df')

# transposing swaps the roles of the two indices:
#  df = df.transpose()
#  printf('df')

A quick guide to pandas.DataFrame operations:

#       "column"  index
#           a       b
# "row"
# index    data   data
#   .       x       x
#   .       x       x
#   .       x       x
#   .       x       x

# df.x
# df[x]             → look-up, using "column" index, labelled

# df[df.columns[x]] → look-up, using "column" index, positional

# df.iloc[x]        → look-up, using "row" index, positional
# df.loc[x]         → look-up, using "row" index, labelled

# df.groupby(…, axis=0) → aggregate data along "rows" using "row" index
# df.groupby(…, axis=1) → aggregate data along "columns" using "column" index

# df.resample(…)        → perform N:M mapping of data along "rows" using "row" index

# df.stack       → turn "column" index into a "row" index
# df.unstack     → turn "row" index into a "column" index
# df.melt        → take data along "rows" with corresponding "column" index values
#                  and turn into new columns
# df.pivot       → pivot data along "rows" into new columns
# df.pivot_table → perform .groupby and .unstack with finer control
# scratch examples of `Series` mask / callable indexing (no output)
# NOTE(review): `(s > 2) | (s < 3)` is a tautology for ordinary numbers —
#   every value satisfies at least one side — presumably `&` was intended;
#   confirm before relying on it
s[(s > 2) | (s < 3)]
s[s > 2]
# NOTE(review): `...` (Ellipsis) here is a placeholder body,
#   not a real boolean mask
s[lambda s: ...]