ts-python

Python Fundamentals II: Eliminate ad hoc naming and repetitive code with objects

Seminar (Thu Jan 14, 2021; 9:30 AM EST)

Keywords: namedtuples, dataclasses, code readability, reduce duplication or repetition

Presenter: James Powell <james@dutc.io>
Date: Thursday, January 14, 2021
Time: 9:30 AM EST

tmate

tmate link: tmate.io/t/dutc/ts-python

Goals

Why Not Objects?

print("Let's go!")
print("Let's go!")
print("Let's go!")
from pandas import DataFrame
# Toy loan data kept as raw strings and ints (no dedicated dtypes).
# NOTE(review): 'under-29k' looks like a typo for 'under-30k', which appears
# later in the same list — confirm whether the mismatch is deliberate.
df = DataFrame({
    'loan approved':  'Y N Y Y N N'.split(),
    'credit history': [0, 1, 0, 1, 0, 1],
    'income bracket': 'under-29k 30k-50k 30k-50k 30k-50k under-30k over-50k'.split(),
})

# Approval rate per credit-history value. Because approvals are stored as
# 'Y'/'N' strings, they have to be mapped to 1/0 inside the aggregation
# before a mean can be taken.
print(df.pivot_table(index='credit history', values='loan approved',
                     #  columns='income bracket',
                     aggfunc=lambda s: s.map({'Y': 1, 'N': 0}).mean()))
from pandas import DataFrame, Categorical

# Same toy loan data, but every column is converted to a dtype that models
# its meaning before any analysis runs.
df = DataFrame({
    'loan approved':  'Y N Y Y N N'.split(),
    'credit history': [0, 1, 0, 1, 0, 1],
    'income bracket': 'under-30k 30k-50k 30k-50k 30k-50k under-30k over-50k'.split(),
})

# Convert in place: the key must match the existing 'credit history' column
# exactly (with a space), otherwise a stray extra column is created and the
# column used by the group-bys below stays as raw ints.
df['credit history'] = df['credit history'].astype(bool)
df['loan approved']  = df['loan approved'] == 'Y'
df['income bracket'] = Categorical(df['income bracket'])

# With boolean approvals the approval rate is simply the mean.
print(df.groupby('credit history')['loan approved'].mean())
print(df.groupby(['credit history', 'income bracket'])['loan approved'].mean().unstack())
from pandas import Period
p = Period('4Q2021')
print(f'{p = }')
from pandas import Period, Timestamp
# A calendar quarter and the current time, both as rich pandas objects
# rather than bare strings.
p = Period('4Q2021')
t = Timestamp.now()
print(f'{p = }')
# Compares "now" with the timestamp at which the period starts.
print(f'{t < p.start_time = }')
from pandas import Period, Timestamp
class ContemporaneousPeriod(Period):
    """A ``Period`` that rejects periods starting in the future.

    Raises:
        ValueError: if ``start_time`` is later than ``Timestamp.now()`` at
            the moment ``__init__`` runs.
    """

    def __init__(self, *args, **kwargs):
        # The constructor arguments are accepted but not forwarded.
        # NOTE(review): presumably ``Period`` consumes them while creating
        # the object, before ``__init__`` runs — confirm against pandas.
        super().__init__()
        if self.start_time > Timestamp.now():
            raise ValueError('period is in future')
                
p = ContemporaneousPeriod('4Q2020')
print(f'{p = }')

assert isinstance(p, ContemporaneousPeriod)
from pandas import DataFrame
from numpy import clip
from numpy.random import normal, choice

df1 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})

df2 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})

df1['x'] = clip(df1['x'], -.25, +.25)
df2['x'] = clip(df2['x'], -.25, +.25)

subset_df1 = df1[df1['x'] > 0]
subset_df2 = df2[df2['x'] > 0]

grouped_df1 = subset_df1.groupby('z').mean()
grouped_df2 = subset_df2.groupby('z').mean()
from pandas import DataFrame
from numpy import clip
from numpy.random import normal, choice

df1 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})

df2 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})

df1['x'] = clip(df1['x'], -.25, +.25)
df2['x'] = clip(df2['x'], -.25, +.25)

subset_df1_pos = df1[df1['x'] > 0]
subset_df2_pos = df2[df2['x'] > 0]
subset_df1_neg = df1[df1['x'] < 0]
subset_df2_neg = df2[df2['x'] < 0]

grouped_df1_pos = subset_df1_pos.groupby('z').mean()
grouped_df2_pos = subset_df2_pos.groupby('z').mean()
grouped_df1_neg = subset_df1_neg.groupby('z').mean()
grouped_df2_neg = subset_df2_neg.groupby('z').mean()
from pandas import DataFrame
from numpy import clip
from numpy.random import normal, choice

df1 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})

df2 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})

def clean_data(df):
    """Return a copy of ``df`` whose ``'x'`` column is clipped to [-0.25, +0.25].

    The input frame is left unmodified.
    """
    clipped_x = clip(df['x'], -.25, +.25)
    return df.assign(x=clipped_x)

df1 = clean_data(df1)
df2 = clean_data(df2)

def subset_data(df):
    """Return the rows of ``df`` whose ``'x'`` value is strictly positive.

    The filter is built from the ``df`` argument itself, so each frame
    passed in is subset on its own data rather than on a module-level frame.
    """
    return df[df['x'] > 0]

subset_df1 = subset_data(df1)
subset_df2 = subset_data(df2)

def group_data(df):
    """Return the per-group mean of ``df``, grouped by its ``'z'`` column."""
    by_z = df.groupby('z')
    return by_z.mean()

grouped_df1 = group_data(subset_df1)
grouped_df2 = group_data(subset_df2)
from pandas import DataFrame
from numpy import clip
from numpy.random import normal, choice

df1 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})

df2 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})

def clean_data(df):
    """Return a new frame equal to ``df`` with ``'x'`` clipped to [-0.25, +0.25]."""
    lower, upper = -.25, +.25
    return df.assign(x=clip(df['x'], lower, upper))

df1 = clean_data(df1)
df2 = clean_data(df2)

def subset_data(df, *, mode=True):
    """Return the rows of ``df`` on one side of zero in column ``'x'``.

    A truthy ``mode`` keeps rows with ``x > 0``; a falsy ``mode`` keeps rows
    with ``x < 0``. Rows where ``x`` equals zero are dropped either way.
    """
    xs = df['x']
    mask = xs > 0 if mode else xs < 0
    return df[mask]

subset_df1_pos = subset_data(df1)
subset_df2_pos = subset_data(df2)
subset_df1_neg = subset_data(df1, mode=False)
subset_df2_neg = subset_data(df2, mode=False)

def group_data(df):
    """Group ``df`` by its ``'z'`` column and return each group's mean."""
    return df.groupby(by='z').mean()

grouped_df1_pos = group_data(subset_df1_pos)
grouped_df2_pos = group_data(subset_df2_pos)
grouped_df1_neg = group_data(subset_df1_neg)
grouped_df2_neg = group_data(subset_df2_neg)
from pandas import DataFrame
from numpy import clip
from numpy.random import normal, choice
from collections import namedtuple

raw_df1 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})

raw_df2 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})

class Analysis(namedtuple('Analysis', 'raw df')):
    """Pair of the untouched input frame (``raw``) and a working frame (``df``).

    Every step returns a new ``Analysis`` that keeps ``raw`` and swaps in a
    transformed ``df``, so steps can be chained.
    """

    @classmethod
    def from_raw(cls, raw):
        """Build an analysis whose ``df`` is ``raw`` with ``'x'`` clipped to [-0.25, +0.25]."""
        cleaned = raw.assign(x=clip(raw['x'], -.25, +.25))
        return cls(raw, cleaned)

    def subset_data(self, *, mode=True):
        """Keep rows with ``x > 0`` (truthy ``mode``) or ``x < 0`` (falsy ``mode``)."""
        xs = self.df['x']
        if mode:
            return self._replace(df=self.df[xs > 0])
        return self._replace(df=self.df[xs < 0])

    def group_data(self):
        """Replace ``df`` with its per-``'z'`` group means."""
        return self._replace(df=self.df.groupby('z').mean())
        
df1 = Analysis.from_raw(raw_df1)
df2 = Analysis.from_raw(raw_df2)

subset_df1_pos = df1.subset_data()
subset_df2_pos = df2.subset_data()
subset_df1_neg = df1.subset_data(mode=False)
subset_df2_neg = df2.subset_data(mode=False)

grouped_df1_pos = subset_df1_pos.group_data()
grouped_df2_pos = subset_df2_pos.group_data()
grouped_df1_neg = subset_df1_neg.group_data()
grouped_df2_neg = subset_df2_neg.group_data()
from pandas import DataFrame
from numpy import clip
from numpy.random import normal, choice
from collections import namedtuple

raw_df1 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})

raw_df2 = DataFrame({
    'x': normal(size=(size:=10)),
    'y': normal(size=size),
    'z': choice([*'abc'], size=size),
})

class Analysis(namedtuple('Analysis', 'raw df')):
    """Immutable (``raw``, ``df``) pair supporting chained transformations.

    ``raw`` is carried through unchanged; each method returns a fresh
    instance of the same class with a new ``df``.
    """

    @classmethod
    def from_raw(cls, raw):
        """Create an instance with ``df`` set to ``raw`` after clipping ``'x'`` to [-0.25, +0.25]."""
        clipped_x = clip(raw['x'], -.25, +.25)
        return cls(raw, raw.assign(x=clipped_x))

    def subset_data(self, *, mode=True):
        """Return an instance limited to ``x > 0`` rows, or ``x < 0`` rows when ``mode`` is falsy."""
        frame = self.df
        mask = frame['x'] > 0 if mode else frame['x'] < 0
        return type(self)(self.raw, frame[mask])

    def group_data(self):
        """Return an instance whose ``df`` holds the mean of each ``'z'`` group."""
        means = self.df.groupby('z').mean()
        return type(self)(self.raw, means)
        
df1 = Analysis.from_raw(raw_df1)
df2 = Analysis.from_raw(raw_df2)

                # group_data(subset_data(df1))
grouped_df1_pos = df1.subset_data().group_data()
grouped_df2_pos = df2.subset_data().group_data()
grouped_df1_neg = df1.subset_data(mode=False).group_data()
grouped_df2_neg = df2.subset_data(mode=False).group_data()

Interlude: Design Exercise

from some_library import f
f(1)

from some_library import d
d[1]
def f(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)

print(f'{f(x := 2)   = }')

class mydict(dict):
    """A dict whose lookups of absent keys compute ``key ** 2``.

    The computed value is only returned; it is not stored in the dict.
    """

    def __missing__(self, k):
        squared = k ** 2
        return squared
        
d = mydict()
print(f'{d[(k := 2)] = }')
f(x) # "call a function"
     # - means:  computes something, or performs some action
     # - errors: any kind
     # - purity: pure or impure
     # - speed:  fast or slow

d[k] # "look something up"
     # - means:  perform a lookup
     # - errors: KeyError, IndexError; only if "not found"
     # - purity: (prob.) pure
     # - speed:  fast
d[k] # "look something up"
     # - means: perform a lookup

d.a  # "look something up"
     # - means: perform a lookup
d[k] # "look something up"
     # - means: perform a lookup
     #          `k` is some data → unbounded
     # - errors: KeyError, IndexError; only if "not found"

d.a  # "look something up"
     # - means: perform a lookup
     #          `a` is some name → bounded
     # - errors: AttributeError; prob. only if "not valid"
from pandas import DataFrame
df = DataFrame({
    123: [1, 2, 3],
})

print(f'{df.columns    = }')
print(f'{df[123].sum() = }')
print(f'{df.123.sum()  = }')
f(x) # "call a function"
     # - modalities: via keyword arguments

d[k] # "look something up"
     # - modalities: externally encoded
f(x, flag=True) # bounded
f(x, mode=0.5)  # unbounded
from pandas import Series
df = Series([0])

# bounded modality via descriptor protocol
df.loc[0]
df.iloc[0]
# modalities via descriptor protocol
d.mode(0.5)[...]
d.flag(True)[...]

# modalities via context managers
with d.mode(0.5) as _d:
    _d[...]
with d.flag(True) as _d:
    _d[...]
# are the modes fundamental or supplementary?

f(x)
f(x, mode=...)

d[k]
d.mode(...)[k]

Why Objects? (And Some Mechanics)

from collections import namedtuple
from dataclasses import dataclass
from enum import Enum
from collections import namedtuple

Study = namedtuple('Study', 'input output')
#  Study = namedtuple('Study', ['input', 'output'])

studies = [
    Study('inputs-1.csv', 'outputs-1.csv'),
    Study('inputs-2.csv', 'outputs-2.csv'),
]

for st in studies:
    print(st.input, st.output)
    print(st)

#  for input_filename, output_filename in studies:
#      ...
from collections import namedtuple

class Study(namedtuple('StudyBase', 'input output')):
    """Named pair of ``input`` and ``output``; a subclass hook for adding methods."""
from pandas import MultiIndex
MultiIndex.from_tuples
MultiIndex.from_product
MultiIndex.from_arrays
from numpy import array, zeros, ones, eye
from some_future_numpy import Array
Array.from_shape
xs = Array.from_eye(4)
xs.clip()

from collections import namedtuple

class Study(namedtuple('StudyBase', 'input output')):
    """Named (``input``, ``output``) pair with named alternate constructors.

    Both constructors are placeholders: they accept two filenames and
    currently return ``None``.
    """

    @classmethod
    def from_csv_files(cls, input_filename, output_filename):
        """Placeholder for building a study from two CSV files."""
        return None

    @classmethod
    def from_xls_files(cls, input_filename, output_filename):
        """Placeholder for building a study from two XLS files."""
        return None

def study_from_csv_files(input_filename, output_filename):
    """Placeholder free-function constructor for CSV inputs; returns ``None``."""
    return None

def study_from_xls_files(input_filename, output_filename):
    """Placeholder free-function constructor for XLS inputs; returns ``None``."""
    return None
from collections import namedtuple
#  from functools import total_ordering

#  @total_ordering
class Study(namedtuple('StudyBase', 'input output')):
    """Named (``input``, ``output``) pair sketching the syntax hooks a class can offer.

    Every method is a placeholder that returns ``None``; the class only
    shows which protocol method backs which piece of syntax.
    """

    def __call__(self):
        """Backs ``study()``."""
        pass

    def __getitem__(self, key):
        """Backs ``study[key]``; the subscript arrives as ``key``."""
        pass

    def __lt__(self, other):
        """Backs ``study < other``."""
        pass

    # NOTE: defining __eq__ without __hash__ makes instances unhashable.
    def __eq__(self, other):
        """Backs ``study == other``."""
        pass

    def __contains__(self, value):
        """Backs ``value in study``."""
        pass

#  'result' in study
#  study.contains('results')
#  contained_in_study(study, 'result')
from numpy import array
xs = array([1, 2, 3])
print(f'{0 < xs < 5 = }')
print(f'{(0 < xs) & (xs < 5) = }')
from pandas import DataFrame
from dataclasses import dataclass

@dataclass
class Study:
    """Record of a study's input data, output data and a threshold.

    ``@dataclass`` generates ``__init__``, ``__repr__`` and ``__eq__`` from
    the annotated fields below. The annotations are not checked at runtime.
    """

    # Declared as DataFrames; only ``threshold`` has a default.
    input_data  : DataFrame
    output_data : DataFrame
    threshold   : float = 0.5

st = Study([1, 2, 3], ...)
print(f'{st = }')
from enum import Enum, auto

class Study(Enum):
    """Closed set of study kinds; member values are assigned by ``auto()``."""

    Longitudinal = auto()
    Cohort = auto()
    Panel = auto()

st = Study['Panel']
assert st is Study.Panel

Example

from random import choice

def random_strategy():
    ''' randomly select a shape: one of 'r', 'p' or 's' '''
    # Choose among the individual letters; wrapping the whole string in a
    # one-element list would always return 'rps' itself.
    return choice([*'rps'])

def beat_previous_play():
    ''' select the shape that would beat the opponent's previous play '''
    pass

def most_common_play(n=3):
    ''' select the most common shape from the opponent's previous N plays '''
    pass

games = [(random_strategy(), random_strategy()) for _ in range(10_000)]
results = [rules(a, b) for a, b in games]

Attendee Solution

from random import choice
from collections import Counter, deque

class Game:
    """Rock-paper-scissors state plus the strategies that read it.

    Shapes are the one-letter strings "r", "p" and "s". Index 0 of
    ``histories`` belongs to the player and index 1 to the challenger;
    each history is bounded by ``memory`` and holds the most recent play
    at position 0.
    """

    def __init__(self, memory=5):
        # fifo, most recent to the left
        self.histories = [
            deque([], maxlen=memory),
            deque([], maxlen=memory)
        ]
        # Maps a shape to the shape that defeats it.
        self.what_beats_key = {"r": "p", "s": "r", "p": "s"}

    def beat_previous_play(self, player_id):
        """ select the shape that would beat the opponent's previous play """
        # `not player_id` flips 0 <-> 1 to index the other side's history.
        opponents_history = self.histories[not player_id]
        if not opponents_history:
            return self.random_strategy()
        return self.what_beats_key[opponents_history[0]]

    def most_common_play(self, player_id, n=3):
        """ select the most common shape from the opponent's previous N plays

        Falls back to a random shape when there is nothing to count yet
        (no recorded plays, or ``n`` selects none), matching
        ``beat_previous_play``.
        """
        opponents_history = self.histories[not player_id]
        # Slicing already caps at the history length.
        opponents_recent_history = list(opponents_history)[:n]
        if not opponents_recent_history:
            return self.random_strategy()
        return Counter(opponents_recent_history).most_common(1)[0][0]

    def random_strategy(self):
        """ randomly select a shape """
        return choice(["r", "p", "s"])

    def show_hands(self, player_shape, challenger_shape):
        """ record both plays in the histories and return them as a pair """
        self.histories[0].appendleft(player_shape)
        self.histories[1].appendleft(challenger_shape)
        return player_shape, challenger_shape

    def rules(self, a: str, b: str) -> "str | None":
        """
            Returns who wins, given shapes played by two players a and b
            a: Player
            b: Challenger
            Returns one of:
                "1": Player wins
                "X": Tie
                "2": Challenger wins
                None: either shape is not one of "r", "p", "s"
        """
        valid_shapes = ("r", "p", "s")

        # Unknown shapes yield no result rather than an exception.
        if a not in valid_shapes or b not in valid_shapes:
            return None

        # Identical shapes tie.
        if a == b:
            return "X"

        # The challenger wins exactly when b is the shape that beats a.
        return "2" if b == self.what_beats_key[a] else "1"

g = Game()
games = [g.show_hands(g.random_strategy(), g.beat_previous_play(1)) for _ in range(10_000)]
results = [g.rules(a, b) for a, b in games]
ranking = Counter(results)
print("1: Player 1 wins, X: tie, 2: Player 2 wins.")
print(f"{ranking = }")

My Solution

from enum import Enum, auto
from random import choice
from itertools import combinations, product, count
from collections import defaultdict, Counter, deque

class Shape(Enum):
    Rock     = auto()
    Paper    = auto()
    Scissors = auto()

class Winner(Enum):
    Player     = auto()
    Challenger = auto()

beats = {
    Shape.Rock:    Shape.Scissors,
    Shape.Scissors:   Shape.Paper,
    Shape.Paper:       Shape.Rock,
}
beaten_by = {v:k for k,v in beats.items()}

def game(a : Shape, b : Shape):
    """Decide one round between the player's shape ``a`` and the challenger's ``b``.

    Returns ``Winner.Player`` when ``a`` beats ``b``, ``Winner.Challenger``
    when ``b`` beats ``a``, and ``None`` otherwise.
    """
    if beats[a] == b:
        return Winner.Player
    if beaten_by[a] == b:
        return Winner.Challenger
    return None

# Registry mapping each wrapped strategy factory to its display name.
STRATEGIES = {}
def strategy(name):
    """Decorator factory that registers a generator-based strategy under ``name``.

    The decorated function is replaced by a wrapper. Calling the wrapper
    creates the generator and returns its bound ``send`` method, so a
    strategy is driven by calling ``send(previous_play)`` repeatedly.
    """
    def register(generator_func):
        def start(*args, **kwargs):
            generator = generator_func(*args, **kwargs)
            return generator.send
        STRATEGIES[start] = name
        return start
    return register

@strategy('dumb & random')
def random_strategy():
    """Endless generator of uniformly random shapes; values sent in are ignored."""
    def endless_random_shapes():
        for _ in count():
            yield choice([*Shape])
    return endless_random_shapes()

@strategy('smarter')
def most_common(n=10):
    """Generator strategy that replays the opponent's most frequent recent shape.

    Each value sent in is treated as the opponent's latest play and kept in
    a window of the last ``n`` plays. The value yielded back is the most
    common shape in that window, or a random shape while the window is
    empty. The first ``send(None)`` only starts the generator and yields
    ``None``.
    """
    history = deque(maxlen=n)
    last_play = yield
    while True:
        # Record what the opponent just played; without this the window
        # stays empty and every play is random. ``None`` is skipped so a
        # start-up value is never counted as a shape.
        if last_play is not None:
            history.append(last_play)
        if not history:
            play = choice([*Shape])
        else:
            play = Counter(history).most_common(1)[0][0]
        last_play = yield play

# Round-robin tournament: every registered strategy plays every registered
# strategy (itself included), then the tallies are printed.
if __name__ == '__main__':
    ROUNDS = 10_000
    # (player strategy, challenger strategy) -> Counter of game() results
    results = defaultdict(Counter)
    for p_st, c_st in product(STRATEGIES, STRATEGIES):
        # Calling a registered strategy returns its generator's `send`.
        ps = p_st()
        cs = c_st()
        # The first send(None) starts each generator.
        # NOTE(review): the player's first value is bound to `c` and the
        # challenger's to `p`, which looks swapped — confirm the intent.
        c, p = ps(None), cs(None)
        # Lazy stream of rounds: the player is sent the challenger's previous
        # play, then the challenger is sent the player's current play.
        games = ((p := ps(c), c := cs(p)) for _ in range(ROUNDS))
        results[p_st, c_st].update(game(p, c) for p, c in games)

    for (p_st, c_st), res_cntr in results.items():
        print(f'Player:     {STRATEGIES[p_st]}')
        print(f'Challenger: {STRATEGIES[c_st]}')
        # `res` is None when game() reports no winner, so `res and res.name`
        # prints None in that case instead of failing on `.name`.
        for res, cnt in sorted(res_cntr.items(), key=str):
            print(f'    {res and res.name!s:<20} {cnt:,}')