ts-python

applied pandas II: Let’s “get in shape” with groupby! (Discussion)

Discussion Group (Fri Jan 29, 2021; 3:30 PM EST)

Keywords: pandas.DataFrame.groupby, pandas.DataFrame.groupby.apply, pandas.Dataframe.groupby.aggregate, pandas.DataFrame.groupby.transform

Presenter James Powell james@dutc.io
Date Friday, January 29, 2021
Time 3:30 PM EST

Sample Problem & Solution

Determine which window of moves in the game had the maximum number of pieces lost.

print('Hello!')
from pandas import date_range, Series
from numpy.random import normal
s = Series(
    normal(size=100),
    index=date_range('2021-07-04', periods=100),
    #  index=range(100),
)

print(type(s))
print(s.groupby(s.index.dayofweek).sum())
#  print(dir(s.index))
#  print(s.groupby(s.index))
_print = lambda expr: print(f' {expr} '.center(50, '-'), eval(expr), sep='\n')

from pandas import DataFrame, date_range, to_timedelta
from numpy import repeat, array, tile
from numpy.random import choice, random, randint
from random import randrange
from string import ascii_lowercase
from datetime import timedelta

from decimal import Decimal
df = DataFrame({
    'time':    repeat(date_range('2021-07-04 9:00', periods=(size := 500), freq='1H'), (rpt := 40))
             + to_timedelta(randint(0, 59, size=(size*rpt)), unit='T'),
    'ticker':  tile(choice([*ascii_lowercase], size=(size, 4)).view('<U4').ravel(), rpt),
    'price':   tile(random(size) * 100, rpt) + ((random(size*rpt) - .5) * 10),
    'signal':  randint(10, size=size*rpt),
})

df = df.set_index(['time', 'ticker']).sort_index()

from scipy.stats import zscore
from pandas.tseries.offsets import Hour
#  print(df.groupby('ticker').transform(
#      lambda df: df.reset_index(level=1, drop=True).rolling(Hour(1), min_periods=1).apply(zscore)
#  ))
#  print(df.groupby('ticker').apply(zscore))

#  print(df.groupby(df.index.get_level_values(0).hour))

from time import sleep, perf_counter
from contextlib import contextmanager
@contextmanager
def timed(msg):
    try:
        start = perf_counter()
        yield
    finally:
        stop = perf_counter()
        print(f'{msg:<24} \N{mathematical bold capital delta}t: {stop - start:.4f}s')


# 0 . . . -  0
# 1 . . . - .5
# 2 . . . - 1.5
# 3 . . . - 1.75

from numpy import repeat
with timed('numpy'):
    df.groupby('ticker').transform(
        lambda s: ((repeat(s.values, s.values.size) - s.expanding().mean()) / s.expanding().std())
    )
with timed('.apply'):
    df.groupby('ticker').transform(
        lambda df: df.expanding().apply(lambda s: zscore(s)[-1])
    )

# .apply     - very flexible, but slow
# .aggregate - 
# .transform - for similarly indexed
# .filter    - 
df.index = ...

Queen’s Gambit

from pandas import read_csv, Categorical, to_datetime, IndexSlice
from collections import namedtuple

df = read_csv('carlsen.csv',
              header=None,
              names='game move side piece pos0 pos1 pos2 pos3 pos4 pos5 pos6 pos7'.split())
index_vars = ['game', 'move', 'side', 'piece']
df = df.melt(id_vars=index_vars, value_vars=set(df.columns) - set(index_vars),
             var_name='num', value_name='pos').dropna()

df['pos'] = df['pos'].astype(int)
df['num'] = Categorical(df['num'])._ndarray
for col, cats in {'piece': 'Pawn Rook Knight Bishop Queen King'.split(),
                  'side':  'Black White'.split()}.items():
    df[col] = Categorical(df[col].str.strip(), categories=cats)
df = df.set_index([*index_vars, 'num']).sort_index()

game_0 = df.loc[IndexSlice[0, :]]

class Diffs(namedtuple('DiffsBase', 'size white black raw')):
    @classmethod
    def from_game(cls, game, *, size=5):
        windows = game_0.groupby(['move', 'side']).count().dropna()
        wh = windows.loc[IndexSlice[:, 'White', :]].diff().fillna(0).rolling(size, min_periods=1).sum()
        bl = windows.loc[IndexSlice[:, 'Black', :]].diff().fillna(0).rolling(size, min_periods=1).sum()
        return cls(size, wh, bl, game)

diffs = Diffs.from_game(game_0)
print(diffs.white.values)
from pandas import read_csv, Categorical, to_datetime, IndexSlice
from collections import namedtuple

df = read_csv('carlsen.csv',
              header=None,
              names='game move side piece pos0 pos1 pos2 pos3 pos4 pos5 pos6 pos7'.split())
index_vars = ['game', 'move', 'side', 'piece']
df = df.melt(id_vars=index_vars, value_vars=set(df.columns) - set(index_vars),
             var_name='num', value_name='pos').dropna()

df['pos'] = df['pos'].astype(int)
df['num'] = Categorical(df['num'])._ndarray
for col, cats in {'piece': 'Pawn Rook Knight Bishop Queen King'.split(),
                  'side':  'Black White'.split()}.items():
    df[col] = Categorical(df[col].str.strip(), categories=cats)
df = df.set_index([*index_vars, 'num']).sort_index()

game_0 = df.loc[IndexSlice[0, :]]
end = game_0.loc[IndexSlice[83, :]]
print(end.groupby(['side', 'piece']).count().unstack().fillna(0).convert_dtypes())
print(end.pivot_table(index=['side'],
                      columns=['piece'],
                      values=['pos'],
                      aggfunc='count',
                      fill_value=0))
from pandas import read_csv, Categorical, to_datetime, IndexSlice
from xarray import DataArray
from numpy import array, zeros

df = read_csv('carlsen.csv',
              header=None,
              names='game move side piece pos0 pos1 pos2 pos3 pos4 pos5 pos6 pos7'.split())
index_vars = ['game', 'move', 'side', 'piece']
#  df = df.melt(id_vars=index_vars, value_vars=set(df.columns) - set(index_vars),
#               var_name='num', value_name='pos').dropna()
for col, cats in {'piece': 'Pawn Rook Knight Bishop Queen King'.split(),
                  'side':  'Black White'.split()}.items():
    df[col] = Categorical(df[col].str.strip(), categories=cats)#._ndarray
df = df.set_index('game')
df['alive'] = sum(~df[f'pos{idx}'].isnull() for idx in range(8))

game_0 = df.loc[0]
state = game_0.groupby(['move', 'side'])['alive'].sum()
by_side = state.groupby(['side']).transform(
    lambda df: -df.diff().rolling(25, min_periods=1).sum().fillna(0)
)
worst_white = by_side[IndexSlice[:, 'White', :]].idxmax()
print(state.loc[(worst_white-25, 'White'):(worst_white, 'White')][IndexSlice[:, 'Black', :]])
```python
from pandas import read_csv, Categorical, to_datetime, IndexSlice, Series, DataFrame
from xarray import DataArray
from numpy import array, zeros, log

df = read_csv('carlsen.csv',
              header=None,
              names='game move side piece pos0 pos1 pos2 pos3 pos4 pos5 pos6 pos7'.split())
index_vars = ['game', 'move', 'side', 'piece']
df = df.melt(id_vars=index_vars, value_vars=set(df.columns) - set(index_vars),
             var_name='num', value_name='pos').dropna()
for col, cats in {'piece': 'Pawn Rook Knight Bishop Queen King'.split(),
                  'side':  'Black White'.split()}.items():
    df[col] = Categorical(df[col].str.strip(), categories=cats)#._ndarray

df['pos'] = df['pos'].astype(int)
df['num'] = Categorical(df['num'])._ndarray
for col, cats in {'piece': 'Pawn Rook Knight Bishop Queen King'.split(),
                  'side':  'Black White'.split()}.items():
    df[col] = Categorical(df[col].str.strip(), categories=cats)
df['progress'] = df.groupby(['game'])['move'].transform(
    lambda s: s / s.max()
)
df = df.drop('move', axis=1)
df = df.set_index(['game', 'progress', 'side', 'piece', 'num']).sort_index()

game_0 = df.loc[IndexSlice[0, :]]
att = game_0.groupby(['progress', 'side', 'piece']).count().unstack().unstack()['pos'].fillna(0).swaplevel(axis=1).sort_index(axis=1).fillna(0).convert_dtypes()

piece_count = Series({
    'Pawn':   8,
    'Rook':   2,
    'Knight': 2,
    'Bishop': 2,
    'Queen':  1,
    'King':   1,
})
piece_count.index = df.index.levels[-2]
res = (att['Black'] / piece_count) #.plot(kind='area')
res = res / DataFrame({
    'Pawn':   res.index,
    'Rook':   res.index,
    'Knight': res.index,
    'Bishop': res.index,
    'Queen':  res.index,
    'King':   res.index,
}, index=res.index)

if (show := False):
    from matplotlib.pyplot import show
    res.plot(kind='area')
    show()

##