Donkey (Eddie Murphy): Hey, what’s your problem, Shrek, what you got against the whole world anyway, huh?
Shrek (Mike Myers): Look, I’m not the one with the problem, okay? It’s the world that seems to have a problem with ME! People take one look at me and go “Aargh! Help! Run! A big stupid ugly ogre!” They judge me before they even know me - that’s why I’m better off alone…
Donkey (Eddie Murphy): You know, Shrek… when we first met, I didn’t think you were a big, stupid, ugly ogre.
Shrek (Mike Myers): Yeah, I know.
— Shrek (2001)
| Date | Time | Track | Meeting Link |
|---|---|---|---|
| Fri, Dec 3, 2021 | 9:30 AM EDT | Python fundamentals & misc. | https://primetime.bluejeans.com/a2m/live-event/xcsugkge |
These sessions are designed for a broad audience of modelers and software programmers of all backgrounds and skill levels.
Our expected audience should comprise attendees…
pandas.
During this session, we will endeavour to guide our audience to developing…
…and we will share additional tips, tricks, and in-depth guidance on all of these topics!
In this special holiday episode, we’re going to have some fun! We’ll discuss
design mistakes (“warts”) made in the development of Python and common analysis
libraries like pandas. We’ll cover why these exist, how to come to terms with
them, how they affect the design and implementation of our own systems and
analytical efforts, and what these tell us more broadly about Python and
programming.
Did you enjoy this episode? Did you learn something new that will help you as you continue or begin to use window methods in your work?
If so, stay tuned for future episodes, which may…
pandas and xarray, how these may affect your work, and common workarounds used to address these.
Let’s get started!
# Opening cell of the session: prints a fixed greeting.
print("Let's get started!")
stringing me along!
from random import choice as py_choice
from numpy.random import choice as np_choice
from string import ascii_lowercase
# Prints the lowercase alphabet followed by one item drawn from it by the
# stdlib `choice`, separated by a 40-character horizontal rule.
print(
ascii_lowercase,
py_choice(ascii_lowercase),
# NOTE(review): the numpy variant is parked behind a comment — presumably
# because numpy's `choice` does not treat a str as a sequence of characters
# the way the stdlib one does; confirm by re-enabling it.
# np_choice(ascii_lowercase),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Why does this matter?
from pandas import Series
# Build an int64 array and display it.
#
# The cell originally read `array(dtype='int64')`, which cannot run: `array`
# was never imported here, and numpy's `array` requires the data as its first
# argument.  An empty list is supplied so the requested dtype is the only
# constraint on the result.
# NOTE(review): the data the presenter meant to pass is not recoverable from
# this cell — replace `[]` with it if known.
from numpy import array

xs = array([], dtype='int64')
print(
    xs,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import Series
from operator import or_
# Two Series whose elements are Python sets, built from parallel lists.
s1, s2 = (
    Series(sets)
    for sets in (
        [{1, 2}, {2, 3, 4}, {3, 4, 5}],
        [{2, 3}, {1, 2, 4}, {1, 4, 5}],
    )
)

# Pair the rows of both Series and merge each pair with `operator.or_`.
print(
    Series.combine(s1, s2, or_),
    # s1.explode().append(s2.explode()),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
What else can you show me?
from pandas import Series, DataFrame
from numpy.random import default_rng
string up trouble!
from numpy import array
from pandas import Series
# The same three strings held by a numpy array and by a pandas Series; each
# container is printed next to its dtype so the two can be compared.
xs = array(['abc', 'def', 'wxyz'])
s = Series(['abc', 'def', 'wxyz'])
print(
# The `=` specifier echoes the expression text ahead of its value, so the
# text inside the braces is part of the output.
xs, f'{xs.dtype = }',
s, f'{s.dtype = }',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Why does this matter?
from pandas import Series
# Start from the plain string Series, then convert it to categorical in a
# separate step before printing it with its dtype.
s = Series(['abc', 'def', 'wxyz'])
s = s.astype('category')
print(
    s,
    f'{s.dtype = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from contextlib import contextmanager
from time import perf_counter
@contextmanager
def timed(msg):
    """Time the body of a ``with`` block and print the elapsed seconds.

    Parameters
    ----------
    msg
        Label printed ahead of the timing; it is left-aligned in a
        16-character field.

    Yields
    ------
    None
        Control is handed to the ``with`` body exactly once.

    Notes
    -----
    The report is printed from a ``finally`` clause, so it appears even when
    the body raises; the exception then propagates unchanged.
    """
    start = perf_counter()
    try:
        yield
    finally:
        stop = perf_counter()
        print(f'{msg:<16} \N{mathematical bold capital delta}t: {stop - start:.6f}s')
from pandas import Series
from numpy.random import default_rng
from string import ascii_lowercase
# 100,000 random four-letter strings, held once as a plain Series (`s1`) and
# once converted to categorical (`s2`).  Only `s1` and its dtype are printed.
rng = default_rng(0)
s1 = Series(
# Draw a (100_000, width) grid of single letters, reinterpret every row of
# `width` one-character cells as a single `width`-character string via
# `.view`, then flatten the result to one dimension.
rng.choice([*ascii_lowercase], size=(100_000, (width := 4))).view(f'<U{width}').ravel()
)
s2 = s1.astype('category')
print(
s1, f'{s1.dtype = }',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
# Disabled timing comparison of an elementwise equality test against each of
# the two Series; it relies on the `timed` context manager.
# with timed('object'):
# '.' == s1
# with timed('category'):
# '.' == s2
from pandas import DataFrame
from numpy.random import default_rng
from string import ascii_lowercase
# Small seeded frame: an integer column, a rounded float column, and a column
# of single lowercase letters, all of the same length.
rng = default_rng(0)
size = 4
df = DataFrame(dict(
    a=rng.integers(10, size=size),
    b=rng.random(size=size).round(2),
    c=rng.choice(list(ascii_lowercase), size=size),
))

# Show the frame, then three `in` tests against individual columns.
print(
    df,
    0 in df['a'],
    'n' in df['c'],
    0 in df['c'],
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Why does this matter?
.index!
Rules of pandas:
from pandas import Series, MultiIndex, date_range, IndexSlice
from numpy.random import default_rng
from string import ascii_lowercase
# Synthetic panel: one rounded normal draw per (date, asset, underlying)
# combination, of which a random 20% is kept and then sorted by index.
rng = default_rng(0)
dates = date_range('2000-01-01', periods=100)
# 10 four-letter asset names and 16 two-letter underlying names, each made by
# viewing rows of single random letters as one fixed-width string.
assets = rng.choice([*ascii_lowercase], size=(10, 4)).view('<U4').ravel()
underlyings = rng.choice([*ascii_lowercase], size=(16, 2)).view('<U2').ravel()
idx = MultiIndex.from_product([
dates, assets, underlyings
], names=['date', 'asset', 'underlying'])
s = (
Series(rng.normal(size=len(idx)).round(2), index=idx)
.sample(frac=.20, random_state=rng.bit_generator)
.sort_index()
)
print(
# Select the January 2000 dates on the first index level, then reduce each
# asset to a single number.
# NOTE(review): `dts` is bound by the walrus below but never read again; the
# reindex further down uses the full 100-day `dates` — confirm that is intended.
s.loc[IndexSlice[(dts := date_range('2000-01-01', '2000-01-31')), :]]
.groupby('asset').agg(lambda g:
# True for each date on which the asset has at least one positive value ...
((g[g > 0]).groupby('date').size() > 0)
# ... extended to every day in `dates`, with absent days filled as False.
.reindex(dates, fill_value=False)
# Number the rows within each stretch delimited by False days and keep the
# largest counter.
.pipe(lambda s: s.groupby((~s).cumsum()).cumcount())
.max()
),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import Series, date_range
from numpy.random import default_rng
# 150 daily observations starting 2000-01-01, used to put positional (`.iloc`)
# and label-based (`.loc`) slicing side by side.
rng = default_rng(0)
idx = date_range('2000-01-01', periods=150, freq='1D')
s = Series(rng.normal(size=len(idx)).round(2), index=idx)
print(
# The first 20 rows by position, with and without their date labels.
s.iloc[0:20],
s.iloc[0:20].reset_index(drop=True),
# A slice by date labels, then the same slice with its last row dropped.
# NOTE(review): pandas documents label slices as including the stop label,
# which is presumably why the trimmed variant is shown — confirm.
s.loc['2000-01-01':'2000-01-31'],
s.loc['2000-01-01':'2000-01-31'].iloc[:-1],
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Why does this matter?
.index, did you forget about it?
from pandas import Series, date_range
from numpy.random import default_rng
from string import ascii_lowercase
# A Series labelled by the lowercase letters.  The first print argument slices
# by position; the second slices by label up to the letter at position 5.
rng = default_rng(0)
idx = [*ascii_lowercase]
# Alternative index made of every other letter, left disabled.
# idx = [*ascii_lowercase[::2]]
s = Series(rng.normal(size=len(idx)).round(2), index=idx)
print(
s.iloc[:5],
s.loc[:ascii_lowercase[5]],
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
So what do I do?
from pandas import Series, date_range, cut, IntervalIndex, to_datetime
from numpy.random import default_rng
from numpy import iinfo, int64
# Skeleton for cutting a date-indexed Series into left-closed intervals and
# printing one group per interval.  As written, every break and every print
# argument is commented out: `bins` is built from an empty list and the
# `print` call receives no positional arguments.
rng = default_rng(0)
idx = date_range('2000-01-01', periods=150, freq='1D').rename('date')
s = Series(rng.normal(size=len(idx)).round(2), index=idx)
bins = IntervalIndex.from_breaks([
# Candidate breaks: the int64 nanosecond extremes on the outside and two
# month starts in between.
# to_datetime(iinfo(int64).min+1, unit='ns'),
# to_datetime('2000-01-01'),
# to_datetime('2000-02-01'),
# to_datetime(iinfo(int64).max, unit='ns'),
], closed='left')
print(
# bins,
# *s.groupby(cut(s.index.get_level_values('date'), bins)).pipe(
# lambda gb: [x for _, x in gb]
# ),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
# Identity (`is`) checks between pairs of equal integers bound via the walrus
# operator: 0, 10, and 1_000 (the second 1_000 is produced by `eval` at run
# time).  The `=` specifier prints each expression's text beside its result.
print(
f'{(a := 0) is (b := 0) = }',
f'{(a := 10) is (b := 10) = }',
f'{(a := 1_000) is (b := eval("1_000")) = }',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 50),
)
from numpy import nan, inf
# Identity versus equality for signed zeros (int and float), infinity, and
# NaN.  Each f-string uses the `=` specifier, so the expression text is
# printed beside its result.
print(
f'{(a := +0) is (b := -0) = }',
f'{(a := +0.0) is (b := -0.0) = }',
f'{inf is inf = }',
f'{nan is nan = }',
f'{inf == inf = }',
f'{nan == nan = }',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
class T:
    """Object that never compares equal to anything, itself included."""

    def __eq__(self, _):
        # Ignore the other operand entirely: equality is always denied.
        return False


# Use an instance of `T` (the cell originally built a plain `object()`, which
# left the class above unused) so that identity and equality visibly disagree.
x = T()
print(
    f'{x is x = }',
    f'{x == x = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy import array, nan
# Two NaNs held in a list (`xs`) and in a numpy array (`ys`).  Every comparison
# below is currently commented out, so the `print` call receives no positional
# arguments.
xs = [nan, nan]
ys = array([nan, nan])
print(
# f'{xs[0] != xs[0] = }',
# f'{xs == xs = }',
# f'{ys == ys = }',
# f'{all(x0 == x1 for x0, x1 in zip(xs, xs[1:])) = }',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Why does this matter?
list? What is a numpy.ndarray?
from numpy import array
# Side-by-side characterisation of the two containers, each followed by the
# kind of operation the notes associate with it: explicit iteration for the
# list, a whole-array method call for the ndarray.
# `list`
# opaque collection of elements
# - fixed shape
# - dynamic size
xs = [1, 2, 3]
for x in xs: pass
# `numpy.ndarray`
# manager object serving as a “restricted computation domain”
# with broadcasting (→ ‘tensor’/‘vector’/‘matrix’)
# - dynamic shape
# - fixed size
ys = array([1, 2, 3])
ys.sum()
print(
# Disabled comparisons of `+` and `*` applied to the list and to the array.
# f'{xs + xs = }',
# f'{xs * 3 = }',
# f'{ys + ys = }',
# f'{ys * 3 = }',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
# Equality checks between the boolean constants and the integers 1 and 0; the
# `=` specifier prints each expression's text beside its result.
print(
f'{True == 1 = }',
f'{False == 0 = }',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from statistics import mean
# Feed a list of booleans straight to `statistics.mean` and print the call
# text together with its result.
signals = [True, False, False, True, True]
print(f'{mean(signals) = }')
from string import ascii_lowercase
# Import the stdlib helpers and seed the generator with 0 on the same line,
# ahead of the random draws below.
from random import choice, seed; seed(0)
# Ten random lowercase letters joined into one string.
word = ''.join(choice(ascii_lowercase) for _ in range(10))
print(
f'{word = }',
# The word with its vowels removed, followed by two ways of counting the
# non-vowel letters: summing 1 per match, and summing the booleans directly.
f'{"".join(x for x in word if x not in "aeiou") = }',
f'{sum(1 for x in word if x not in "aeiou") = }',
f'{sum(x not in "aeiou" for x in word) = }',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Why does this matter?
from pandas import Series
from numpy.random import default_rng
# 100 seeded integers drawn from [-10, 10).
rng = default_rng(0)
s = Series(rng.integers(low=-10, high=10, size=100))

# Keep the rows that are both positive and odd, combining the two boolean
# Series with `&`.
print(
    s.loc[s.gt(0) & s.mod(2).eq(1)],
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from contextlib import contextmanager
from time import perf_counter
@contextmanager
def timed(msg):
    """Time the body of a ``with`` block and print the elapsed seconds.

    Parameters
    ----------
    msg
        Label printed ahead of the timing; it is left-aligned in a
        16-character field.

    Yields
    ------
    None
        Control is handed to the ``with`` body exactly once.

    Notes
    -----
    The report is printed from a ``finally`` clause, so it appears even when
    the body raises; the exception then propagates unchanged.
    """
    start = perf_counter()
    try:
        yield
    finally:
        stop = perf_counter()
        print(f'{msg:<16} \N{mathematical bold capital delta}t: {stop - start:.6f}s')
from pandas import Series
from numpy.random import default_rng
# 100,000 seeded integers drawn from [-10, 10), set up for a timing comparison
# that is currently disabled; the `print` call receives no positional
# arguments because its only item is commented out.
rng = default_rng(0)
s = Series(rng.integers(-10, 10, size=100_000))
print(
# s,
sep='\n{}\n'.format(
'\N{box drawings light horizontal}' * 40,
),
)
# Disabled variant 1: scale the positive-even and negative-even rows in place
# through boolean indexing.
# with timed('indexing'):
# s[(s > 0) & (s % 2 == 0)] *= 100
# s[(s < 0) & (s % 2 == 0)] /= 100
# Disabled variant 2: build both masks once and combine them arithmetically.
# NOTE(review): the final expression adds the unscaled `s` to every row,
# including the rows already scaled through `m0`/`m1`, so it does not reproduce
# the result of variant 1 — confirm before enabling.
# with timed('masking'):
# masks = m0, m1 = (
# (s > 0) & (s % 2 == 0),
# (s < 0) & (s % 2 == 0),
# )
# (s * 100 * m0) + (s / 100 * m1) + s