Keywords: namedtuples, dataclasses, code readability, reduce duplication or repetition
| Presenter | James Powell james@dutc.io |
| Date | Thursday, January 14, 2021 |
| Time | 9:30 AM EST |
tmate link: tmate.io/t/dutc/ts-python
# Warm-up output: the same greeting is printed three times.
print("Let's go!")
print("Let's go!")
print("Let's go!")
from pandas import DataFrame
# Toy loan data: one row per application, all columns stored as raw strings/ints.
df = DataFrame({
'loan approved': 'Y N Y Y N N'.split(),
'credit history': [0, 1, 0, 1, 0, 1],
# NOTE(review): 'under-29k' here vs. 'under-30k' later in the same column —
# possibly a deliberate dirty-data example; confirm before relying on this column.
'income bracket': 'under-29k 30k-50k 30k-50k 30k-50k under-30k over-50k'.split(),
})
# Approval rate per 'credit history' value: map Y/N to 1/0, then average.
print(df.pivot_table(index='credit history', values='loan approved',
# columns='income bracket',
aggfunc=lambda s: s.map({'Y': 1, 'N': 0}).mean()))
from pandas import DataFrame, Categorical
# Same toy loan data, but each column is converted to a fitting dtype first
# so the aggregation below needs no ad-hoc mapping.
df = DataFrame({
    'loan approved': 'Y N Y Y N N'.split(),
    'credit history': [0, 1, 0, 1, 0, 1],
    'income bracket': 'under-30k 30k-50k 30k-50k 30k-50k under-30k over-50k'.split(),
})
# Overwrite the existing 'credit history' column (key with a space). Writing
# to 'credit_history' would add a new column and leave the grouped one as int.
df['credit history'] = df['credit history'].astype(bool)
df['loan approved'] = df['loan approved'] == 'Y'
df['income bracket'] = Categorical(df['income bracket'])
# The mean of a boolean column is the approval rate.
print(df.groupby('credit history')['loan approved'].mean())
print(df.groupby(['credit history', 'income bracket'])['loan approved'].mean().unstack())
Convenience.
from pandas import Period
# A quarterly period parsed from its string form.
p = Period('4Q2021')
print(f'{p = }')
from pandas import Period, Timestamp
# Same period again, this time compared against the current timestamp.
p = Period('4Q2021')
t = Timestamp.now()
print(f'{p = }')
# True only while "now" is earlier than the first instant of the period.
print(f'{t < p.start_time = }')
from pandas import Period, Timestamp
class ContemporaneousPeriod(Period):
    """A ``Period`` that refuses to start in the future.

    Raises:
        ValueError: if the period's ``start_time`` is later than the
            current time when the instance is created.
    """

    def __init__(self, *args, **kwargs):
        # NOTE(review): the constructor arguments are deliberately not
        # forwarded; this presumes pandas has already built the period value
        # before __init__ runs — confirm against the pandas version in use.
        super().__init__()
        if self.start_time > Timestamp.now():
            raise ValueError('period is in future')


p = ContemporaneousPeriod('4Q2020')
print(f'{p = }')
assert isinstance(p, ContemporaneousPeriod)
from pandas import DataFrame
from numpy import clip
from numpy.random import normal, choice
# Two independent random sample frames with the same layout:
# numeric 'x' and 'y', and a label 'z' drawn from 'a', 'b', 'c'.
df1 = DataFrame({
'x': normal(size=(size:=10)),
'y': normal(size=size),
'z': choice([*'abc'], size=size),
})
df2 = DataFrame({
'x': normal(size=(size:=10)),
'y': normal(size=size),
'z': choice([*'abc'], size=size),
})
# Every step below is written out once per frame.
# Clean: clip 'x' into [-0.25, +0.25].
df1['x'] = clip(df1['x'], -.25, +.25)
df2['x'] = clip(df2['x'], -.25, +.25)
# Subset: keep rows with strictly positive 'x'.
subset_df1 = df1[df1['x'] > 0]
subset_df2 = df2[df2['x'] > 0]
# Group: per-'z' means.
grouped_df1 = subset_df1.groupby('z').mean()
grouped_df2 = subset_df2.groupby('z').mean()
from pandas import DataFrame
from numpy import clip
from numpy.random import normal, choice
# Two independent random sample frames with the same layout:
# numeric 'x' and 'y', and a label 'z' drawn from 'a', 'b', 'c'.
df1 = DataFrame({
'x': normal(size=(size:=10)),
'y': normal(size=size),
'z': choice([*'abc'], size=size),
})
df2 = DataFrame({
'x': normal(size=(size:=10)),
'y': normal(size=size),
'z': choice([*'abc'], size=size),
})
# Clean: clip 'x' into [-0.25, +0.25].
df1['x'] = clip(df1['x'], -.25, +.25)
df2['x'] = clip(df2['x'], -.25, +.25)
# Subset: positive and negative 'x' are now both needed, so the number of
# hand-written variants doubles (rows with x == 0 land in neither subset).
subset_df1_pos = df1[df1['x'] > 0]
subset_df2_pos = df2[df2['x'] > 0]
subset_df1_neg = df1[df1['x'] < 0]
subset_df2_neg = df2[df2['x'] < 0]
# Group: per-'z' means for each of the four subsets.
grouped_df1_pos = subset_df1_pos.groupby('z').mean()
grouped_df2_pos = subset_df2_pos.groupby('z').mean()
grouped_df1_neg = subset_df1_neg.groupby('z').mean()
grouped_df2_neg = subset_df2_neg.groupby('z').mean()
from pandas import DataFrame
from numpy import clip
from numpy.random import normal, choice
# Two independent random sample frames with the same layout.
df1 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})
df2 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})


def clean_data(df):
    """Return a copy of ``df`` with column 'x' clipped to [-0.25, +0.25]."""
    return df.assign(x=clip(df['x'], -.25, +.25))


df1 = clean_data(df1)
df2 = clean_data(df2)


def subset_data(df):
    """Return the rows of ``df`` whose 'x' value is strictly positive."""
    # Filter the argument itself; referring to the global ``df1`` here would
    # make every call return a subset of df1 regardless of the input.
    return df[df['x'] > 0]


subset_df1 = subset_data(df1)
subset_df2 = subset_data(df2)


def group_data(df):
    """Return the per-'z' mean of the remaining columns of ``df``."""
    return df.groupby('z').mean()


grouped_df1 = group_data(subset_df1)
grouped_df2 = group_data(subset_df2)
from pandas import DataFrame
from numpy import clip
from numpy.random import normal, choice
# Two independent random sample frames with the same layout.
df1 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})
df2 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})


def clean_data(df):
    """Return a copy of ``df`` with column 'x' clipped to [-0.25, +0.25]."""
    return df.assign(x=clip(df['x'], -.25, +.25))


df1 = clean_data(df1)
df2 = clean_data(df2)


def subset_data(df, *, mode=True):
    """Return the rows of ``df`` on one side of zero in column 'x'.

    Args:
        df: frame with a numeric 'x' column.
        mode: truthy selects ``x > 0``; falsy selects ``x < 0``.
            Rows with ``x == 0`` are excluded either way.
    """
    if mode:
        return df[df['x'] > 0]
    return df[df['x'] < 0]


subset_df1_pos = subset_data(df1)
subset_df2_pos = subset_data(df2)
subset_df1_neg = subset_data(df1, mode=False)
subset_df2_neg = subset_data(df2, mode=False)


def group_data(df):
    """Return the per-'z' mean of the remaining columns of ``df``."""
    return df.groupby('z').mean()


grouped_df1_pos = group_data(subset_df1_pos)
grouped_df2_pos = group_data(subset_df2_pos)
grouped_df1_neg = group_data(subset_df1_neg)
grouped_df2_neg = group_data(subset_df2_neg)
from pandas import DataFrame
from numpy import clip
from numpy.random import normal, choice
from collections import namedtuple
# Unprocessed random sample frames; kept under `raw_` names so that the
# cleaned data can live alongside them.
raw_df1 = DataFrame({
'x': normal(size=(size:=10)),
'y': normal(size=size),
'z': choice([*'abc'], size=size),
})
raw_df2 = DataFrame({
'x': normal(size=(size:=10)),
'y': normal(size=size),
'z': choice([*'abc'], size=size),
})
class Analysis(namedtuple('Analysis', 'raw df')):
    """Pairs an untouched input frame (``raw``) with its working frame (``df``).

    Every step returns a new ``Analysis`` carrying the same ``raw`` and an
    updated ``df``, so steps can be chained.
    """

    @classmethod
    def from_raw(cls, raw):
        """Build an analysis whose ``df`` is ``raw`` with 'x' clipped to [-0.25, +0.25]."""
        return cls(raw, raw.assign(x=clip(raw['x'], -.25, +.25)))

    def subset_data(self, *, mode=True):
        """Keep rows with ``x > 0`` (``mode`` truthy) or ``x < 0`` (``mode`` falsy).

        Rows with ``x == 0`` are dropped either way.
        """
        keep = self.df['x'] > 0 if mode else self.df['x'] < 0
        # _replace builds an instance of type(self) with only ``df`` swapped.
        return self._replace(df=self.df[keep])

    def group_data(self):
        """Replace ``df`` with its per-'z' means."""
        return self._replace(df=self.df.groupby('z').mean())
# Build one Analysis per raw frame (cleaning happens in from_raw).
df1 = Analysis.from_raw(raw_df1)
df2 = Analysis.from_raw(raw_df2)
# Each step returns a new Analysis; intermediates are still named individually.
subset_df1_pos = df1.subset_data()
subset_df2_pos = df2.subset_data()
subset_df1_neg = df1.subset_data(mode=False)
subset_df2_neg = df2.subset_data(mode=False)
grouped_df1_pos = subset_df1_pos.group_data()
grouped_df2_pos = subset_df2_pos.group_data()
grouped_df1_neg = subset_df1_neg.group_data()
grouped_df2_neg = subset_df2_neg.group_data()
from pandas import DataFrame
from numpy import clip
from numpy.random import normal, choice
from collections import namedtuple
# Unprocessed random sample frames; kept under `raw_` names so that the
# cleaned data can live alongside them.
raw_df1 = DataFrame({
'x': normal(size=(size:=10)),
'y': normal(size=size),
'z': choice([*'abc'], size=size),
})
raw_df2 = DataFrame({
'x': normal(size=(size:=10)),
'y': normal(size=size),
'z': choice([*'abc'], size=size),
})
class Analysis(namedtuple('Analysis', 'raw df')):
    """Pairs an untouched input frame (``raw``) with its working frame (``df``).

    Every step returns a new ``Analysis`` carrying the same ``raw`` and an
    updated ``df``, so steps can be chained.
    """

    @classmethod
    def from_raw(cls, raw):
        """Build an analysis whose ``df`` is ``raw`` with 'x' clipped to [-0.25, +0.25]."""
        return cls(raw, raw.assign(x=clip(raw['x'], -.25, +.25)))

    def subset_data(self, *, mode=True):
        """Keep rows with ``x > 0`` (``mode`` truthy) or ``x < 0`` (``mode`` falsy).

        Rows with ``x == 0`` are dropped either way.
        """
        keep = self.df['x'] > 0 if mode else self.df['x'] < 0
        # _replace builds an instance of type(self) with only ``df`` swapped.
        return self._replace(df=self.df[keep])

    def group_data(self):
        """Replace ``df`` with its per-'z' means."""
        return self._replace(df=self.df.groupby('z').mean())
# Build one Analysis per raw frame (cleaning happens in from_raw).
df1 = Analysis.from_raw(raw_df1)
df2 = Analysis.from_raw(raw_df2)
# Method chaining reads left to right, unlike the nested-call form:
# group_data(subset_data(df1))
grouped_df1_pos = df1.subset_data().group_data()
grouped_df2_pos = df2.subset_data().group_data()
grouped_df1_neg = df1.subset_data(mode=False).group_data()
grouped_df2_neg = df2.subset_data(mode=False).group_data()
# NOTE(review): `some_library` looks like a placeholder module used only to
# contrast call syntax with lookup syntax — confirm it is not a real dependency.
from some_library import f
f(1)
from some_library import d
d[1]
def f(x):
    """Return ``x`` squared."""
    return x ** 2


# The walrus inside the f-string also binds ``x`` at module level.
print(f'{f(x := 2) = }')
class mydict(dict):
    """A dict whose missing keys evaluate to the key squared.

    The computed value is returned but not stored, so the key stays absent.
    """

    def __missing__(self, k):
        return k ** 2


d = mydict()
# Lookup syntax yielding the same result as calling f(2).
print(f'{d[(k := 2)] = }')
# Notes contrasting what a reader assumes from call syntax, item lookup and
# attribute lookup. The expressions are illustrative, not a runnable program.
f(x) # "call a function"
# - means: computes something, or performs some action
# - errors: any kind
# - purity: pure or impure
# - speed: fast or slow
d[k] # "look something up"
# - means: perform a lookup
# - errors: KeyError, IndexError; only if "not found"
# - purity: (prob.) pure
# - speed: fast
d[k] # "look something up"
# - means: perform a lookup
d.a # "look something up"
# - means: perform a lookup
d[k] # "look something up"
# - means: perform a lookup
# `k` is some data → unbounded
# - errors: KeyError, IndexError; only if "not found"
d.a # "look something up"
# - means: perform a lookup
# `a` is some name → bounded
# - errors: AttributeError; prob. only if "not valid"
from pandas import DataFrame
# A frame whose only column label is the integer 123.
df = DataFrame({
    123: [1, 2, 3],
})
print(f'{df.columns = }')
# Item lookup accepts any label, including a non-string one.
print(f'{df[123].sum() = }')
# Attribute lookup cannot express this label: `df.123` is not valid Python and
# is rejected when the file is compiled, so the line is kept for reference only.
# print(f'{df.123.sum() = }')  # SyntaxError
# Notes on how "modes" are passed: as keyword arguments for a call, and
# through some other channel for a lookup. Illustrative expressions only.
f(x) # "call a function"
# - modalities: via keyword arguments
d[k] # "look something up"
# - modalities: externally encoded
f(x, flag=True) # bounded
f(x, mode=0.5) # unbounded
from pandas import Series
df = Series([0])
# bounded modality via descriptor protocol
# (.loc and .iloc select the lookup mode before the [] is applied)
df.loc[0]
df.iloc[0]
# Design sketch (not runnable as-is): ways a lookup object could take a mode.
# `d.mode` / `d.flag` are hypothetical attributes of the sketched object.
# modalities via descriptor protocol
d.mode(0.5)[...]
d.flag(True)[...]
# modalities via context managers
with d.mode(0.5) as _d:
    _d[...]
with d.flag(True) as _d:
    _d[...]
# are the modes fundamental or supplementary?
f(x)
f(x, mode=...)
d[k]
d.mode(...)[k]
from collections import namedtuple
from dataclasses import dataclass
from enum import Enum
from collections import namedtuple
# A lightweight record type pairing an input file with its output file.
Study = namedtuple('Study', 'input output')
# Study = namedtuple('Study', ['input', 'output'])
studies = [
    Study('inputs-1.csv', 'outputs-1.csv'),
    Study('inputs-2.csv', 'outputs-2.csv'),
]
for st in studies:
    # Fields are reachable by name, and the repr shows them labelled.
    print(st.input, st.output)
    print(st)
# for input_filename, output_filename in studies:
# ...
from collections import namedtuple
class Study(namedtuple('StudyBase', 'input output')):
    """Record with ``input`` and ``output`` fields; a subclass so methods can be added."""
from pandas import MultiIndex
# pandas exposes alternate constructors as `from_*` classmethods on the type.
MultiIndex.from_tuples
MultiIndex.from_product
MultiIndex.from_arrays
# numpy instead uses free functions for the same purpose.
from numpy import array, zeros, ones, eye
# NOTE(review): `some_future_numpy` / `Array` appear to be a hypothetical API
# sketch of the classmethod style — confirm it is not meant to be importable.
from some_future_numpy import Array
Array.from_shape
xs = Array.from_eye(4)
xs.clip()
from collections import namedtuple
class Study(namedtuple('StudyBase', 'input output')):
    """Record with ``input`` and ``output`` fields plus alternate-constructor stubs."""

    @classmethod
    def from_csv_files(cls, input_filename, output_filename):
        """Alternate constructor for CSV inputs; not implemented (returns None)."""
        pass

    @classmethod
    def from_xls_files(cls, input_filename, output_filename):
        """Alternate constructor for XLS inputs; not implemented (returns None)."""
        pass
def study_from_csv_files(input_filename, output_filename):
    """Free-function counterpart of a CSV constructor; not implemented (returns None)."""
    pass
def study_from_xls_files(input_filename, output_filename):
    """Free-function counterpart of an XLS constructor; not implemented (returns None)."""
    pass
from collections import namedtuple
# from functools import total_ordering
# @total_ordering
class Study(namedtuple('StudyBase', 'input output')):
    """Sketch of the syntax hooks a record type can implement.

    All hooks are unimplemented stubs that return None.
    """

    def __call__(self):
        """Hook for ``study()``."""
        pass

    def __getitem__(self, key):
        """Hook for ``study[key]``.

        The ``key`` parameter is required by the protocol; without it every
        ``study[...]`` raises TypeError.
        """
        pass

    def __lt__(self, other):
        """Hook for ``study < other``."""
        pass

    # Defining __eq__ without __hash__ makes instances unhashable.
    def __eq__(self, other):
        """Hook for ``study == other``."""
        pass

    def __contains__(self, value):
        """Hook for ``value in study``."""
        pass
# 'result' in study
# study.contains('results')
# contained_in_study(study, 'result')
from numpy import array
xs = array([1, 2, 3])
# A chained comparison expands to `(0 < xs) and (xs < 5)`, which needs the
# truth value of a multi-element array; numpy raises ValueError for that.
print(f'{0 < xs < 5 = }')
# The element-wise form combines the two boolean arrays with `&` instead.
print(f'{(0 < xs) & (xs < 5) = }')
from pandas import DataFrame
from dataclasses import dataclass
@dataclass
class Study:
    """Mutable record of a study's data frames and a decision threshold."""

    input_data : DataFrame
    output_data : DataFrame
    threshold : float = 0.5  # used when the caller does not supply one


# Annotations are not enforced at runtime: a list and Ellipsis are accepted
# where DataFrames are declared.
st = Study([1, 2, 3], ...)
print(f'{st = }')
from enum import Enum, auto
class Study(Enum):
    """Closed set of study designs; values are assigned automatically."""

    Longitudinal = auto()
    Cohort = auto()
    Panel = auto()


# Members can be looked up by name and are singletons.
st = Study['Panel']
assert st is Study.Panel
from random import choice
def random_strategy():
    ''' randomly select a shape '''
    # Choose among the three characters; choice(['rps']) would pick from a
    # one-element list and always return the whole string 'rps'.
    return choice('rps')
def beat_previous_play():
    ''' select the shape that would beat the opponent's previous play '''
    # Stub: not implemented, returns None.
    pass
def most_common_play(n=3):
    ''' select the most common shape from the opponent's previous N plays '''
    # Stub: not implemented, returns None.
    pass
# Simulate 10,000 rounds in which both sides play the random strategy.
games = [(random_strategy(), random_strategy()) for _ in range(10_000)]
# NOTE(review): no module-level `rules` function is defined in this snippet;
# as written this line would raise NameError — confirm where `rules` comes from.
results = [rules(a, b) for a, b in games]
from random import choice
from collections import Counter, deque
class Game:
    """Rock-paper-scissors between a player (id 0) and a challenger (id 1).

    Shapes are the strings "r", "p" and "s". Each side's recent shapes are
    kept in a bounded history with the most recent play first.
    """

    def __init__(self, memory=5):
        """Create empty histories that remember at most ``memory`` plays each."""
        # fifo, most recent to the left
        self.histories = [
            deque([], maxlen=memory),
            deque([], maxlen=memory),
        ]
        # Maps a shape to the shape that defeats it.
        self.what_beats_key = {"r": "p", "s": "r", "p": "s"}

    def beat_previous_play(self, player_id):
        """ select the shape that would beat the opponent's previous play """
        # `not player_id` flips 0 <-> 1, selecting the other side's history.
        opponents_history = self.histories[not player_id]
        if not opponents_history:
            # Nothing to react to yet.
            return self.random_strategy()
        return self.what_beats_key[opponents_history[0]]

    def most_common_play(self, player_id, n=3):
        """ select the most common shape from the opponent's previous N plays """
        opponents_history = self.histories[not player_id]
        # Slicing already copes with fewer than n recorded plays.
        opponents_recent_history = list(opponents_history)[:n]
        if not opponents_recent_history:
            # No history to count: fall back to a random shape, as
            # beat_previous_play does, rather than raising IndexError.
            return self.random_strategy()
        return Counter(opponents_recent_history).most_common(1)[0][0]

    def random_strategy(self):
        """ randomly select a shape """
        return choice(["r", "p", "s"])

    def show_hands(self, player_shape, challenger_shape):
        """Record both shapes in the histories and return them as a pair."""
        self.histories[0].appendleft(player_shape)
        self.histories[1].appendleft(challenger_shape)
        return player_shape, challenger_shape

    def rules(self, a: str, b: str) -> "str | None":
        """
        Returns who wins, given shapes played by two players a and b
        a: Player
        b: Challenger
        Returns one of:
        "1": Player wins
        "X": Tie
        "2": Challenger wins
        Returns None when either shape is not one of "r", "p", "s".
        """
        # Reject unknown shapes.
        if a not in self.what_beats_key or b not in self.what_beats_key:
            return None
        # Identical shapes tie.
        if a == b:
            return "X"
        # Otherwise exactly one side holds the shape that beats the other.
        return "2" if b == self.what_beats_key[a] else "1"
g = Game()
# 10,000 rounds: the player plays randomly, the challenger (id 1) plays
# whatever beats the player's previous shape.
games = [g.show_hands(g.random_strategy(), g.beat_previous_play(1)) for _ in range(10_000)]
results = [g.rules(a, b) for a, b in games]
# Tally of the outcome codes returned by rules().
ranking = Counter(results)
print("1: Player 1 wins, X: tie, 2: Player 2 wins.")
print(f"{ranking = }")
from enum import Enum, auto
from random import choice
from itertools import combinations, product, count
from collections import defaultdict, Counter, deque
class Shape(Enum):
    """The three shapes a side can play."""

    Rock = auto()
    Paper = auto()
    Scissors = auto()
class Winner(Enum):
    """Which side won a round."""

    Player = auto()
    Challenger = auto()
# winner -> the shape it defeats
beats = dict([
    (Shape.Rock, Shape.Scissors),
    (Shape.Scissors, Shape.Paper),
    (Shape.Paper, Shape.Rock),
])
# Inverse table: loser -> the shape that defeats it.
beaten_by = {loser: winner for winner, loser in beats.items()}
def game(a : Shape, b : Shape):
    """Decide one round between the player's shape ``a`` and the challenger's ``b``.

    Returns:
        ``Winner.Player`` if ``a`` defeats ``b``, ``Winner.Challenger`` if
        ``b`` defeats ``a``, and ``None`` otherwise (i.e. on a tie).
    """
    if beats[a] == b:
        return Winner.Player
    elif beaten_by[a] == b:
        return Winner.Challenger
    return None
# Registry: wrapped strategy factory -> display name.
STRATEGIES = {}


def strategy(name):
    """Decorator factory that registers a generator-based strategy under ``name``.

    The decorated function must return a generator (or generator expression).
    Calling the wrapped function creates that generator and returns its bound
    ``send`` method, so a strategy instance is used as ``play = send(opponent_play)``.
    The first call must pass ``None`` to start the generator.
    """
    from functools import wraps  # local import keeps the block self-contained

    def dec(g):
        @wraps(g)  # keep the strategy's own name and docstring
        def inner(*args, **kwargs):
            gi = g(*args, **kwargs)
            return gi.send
        STRATEGIES[inner] = name
        return inner
    return dec
@strategy('dumb & random')
def random_strategy():
    """Endless stream of uniformly random shapes; values sent in are ignored."""
    return (choice([*Shape]) for _ in count())
@strategy('smarter')
def most_common(n=10):
    """Play the shape seen most often among the opponent's last ``n`` plays.

    The opponent's play is received through ``send``; the first send (``None``)
    only starts the generator. While no plays have been recorded, a random
    shape is chosen.
    """
    history = deque(maxlen=n)
    last_play = yield
    while True:
        # Record what the opponent just played; without this the history
        # never fills and the strategy is indistinguishable from random.
        if last_play is not None:
            history.append(last_play)
        if not history:
            play = choice([*Shape])
        else:
            play = Counter(history).most_common(1)[0][0]
        last_play = yield play
if __name__ == '__main__':
    # Play every registered strategy against every other (and itself).
    ROUNDS = 10_000
    results = defaultdict(Counter)
    for p_st, c_st in product(STRATEGIES, STRATEGIES):
        ps = p_st()
        cs = c_st()
        # Start both generators. ``c`` must hold the challenger's value because
        # it is what the player's strategy receives on the first round.
        p, c = ps(None), cs(None)
        # Each round: the player reacts to the challenger's last play, then
        # the challenger reacts to the player's new play.
        games = ((p := ps(c), c := cs(p)) for _ in range(ROUNDS))
        results[p_st, c_st].update(game(p, c) for p, c in games)
    for (p_st, c_st), res_cntr in results.items():
        print(f'Player: {STRATEGIES[p_st]}')
        print(f'Challenger: {STRATEGIES[c_st]}')
        # A tie is stored under None, hence the `res and res.name` guard.
        for res, cnt in sorted(res_cntr.items(), key=str):
            print(f' {res and res.name!s:<20} {cnt:,}')