ts-python

Seminar VI: “Ghostbusters” (Index and MultiIndex in pandas)

Dr. Peter Venkman (Bill Murray): Ray has gone bye-bye, Egon… what’ve you got left?

Dr. Egon Spengler (Harold Ramis): Sorry, Venkman, I’m terrified beyond the capacity for rational thought.

Ghostbusters (1984)

Date Time Track Meeting Link
Fri Sep 10, 2021 9:30 AM EDT Working with Indices and MultiIndices Seminar VI: “Ghostbusters”

Contents

Audience

These sessions are designed for a broad audience of modelers and software programmers of all backgrounds and skill levels.

Our expected audience should comprise attendees with at least a…

During this session, we will endeavour to guide our audience to developing…

Abstract

In previous episodes, we have talked about the structure of the pandas.Series and pandas.DataFrame. We have shown examples that involve complex pandas.Index structures, including the use of explicitly hierarchical pandas.MultiIndexes.

In this session, we will take a closer look at the pandas.Index, its API, the common variations of indices we work with, and considerations when using an implicitly or explicitly hierarchical index. We will also take a close look at the use of the pandas.MultiIndex and review situations where the MultiIndex might come up, where we may want to embrace use of the MultiIndex, and common situations where we want to stay clear of its complexity!

To Be Continued…

Did you enjoy this episode? Did you learn something new that will help you as you perform analyses on pandas.DataFrames with more complex structures?

If so, stay tuned for future episodes, which may…

If there are other related topics you’d like to see covered, please reach out to Diego Torres Quintanilla.

Notes

print("Let's go!")

The Rules of the Game: Python

Rules of Python:

s = 'a,b,c'
xs = s.split(',')
xs = [x.upper() for x in xs]

Consequences:

Consequences:

from collections import namedtuple
from dataclasses import dataclass
from enum import Enum, auto

@dataclass
class BondInstrument:
    issuer : object

@dataclass
class CdsInstrument:
    issuer : object

Position = namedtuple('Position', 'instrument counterparty quantity')

Currency = Enum('Currency', 'AUD GBP USD')
# or 
class Currency(Enum):
    AUD, GBP, USD = auto(), auto(), auto()

portfolio = [
    Position(BondInstrument(...), counterparty, -1_000_000),
    Position(CdsInstrument(...), counterparty, 5_000_000),
]
from dataclasses import dataclass
from enum import Enum

Currency = Enum('Currency', 'AUD GBP USD')

@dataclass
class BondInstrument:
    issuer : str
    currency : Currency

@dataclass
class Position:
    instrument : object
    quantity : float

portfolio = [
    Position(BondInstrument(..., Currency.USD), 10_000_000),
    Position(BondInstrument(..., Currency.GBP), 10_000_000),
]

for pos in portfolio:
    if pos.instrument.currency is Currency.USD:
        pass

Consequence:

x, y = 1, 2
print(f'{x  + y  = }')

x, y = 1.1, 2.5
print(f'{x  + y  = }')

x, y = 1 + 1j, 2 + 5j
print(f'{x  + y  = }')

x, y = 'abc', 'def'
print(f'{x  + y  = }')

xs, ys = [1, 2, 3], [4, 5, 6]
print(f'{xs + ys = }')

from collections import Counter
c1, c2 = Counter('aabc'), Counter('aabbcd')
print(f'{c1 + c2 = }')
from numpy import array, shares_memory

portfolio = array([10_000_000, -1_000_000])

print(f'{portfolio.dtype                          = }')
print(f'{portfolio.shape                          = }')
print(f'{portfolio.strides                        = }')
print(f'{portfolio.__array_interface__["data"][0] = :#_x}')

print(f'{shares_memory(portfolio, portfolio[:1])  = }')
print(f'{portfolio.view("uint64")                 = }')

The Rules of the Game: NumPy

Rules of numpy:

from _utils import timed
from random import randint as py_randint
from numpy.random import randint as np_randint
from numpy import array

with timed('py create'):
    py_xs = [py_randint(-1_000, +1_000) for _ in range(1_000_000)]
    py_ys = [py_randint(-1_000, +1_000) for _ in range(1_000_000)]

with timed('np create'):
    np_xs = np_randint(-1_000, +1_000, size=1_000_000)
    np_ys = np_randint(-1_000, +1_000, size=1_000_000)

dot = lambda xs, ys: sum(x * y for x, y in zip(xs, ys))

with timed('py dot'):
    dot(py_xs, py_ys)

with timed('np dot'):
    np_xs.dot(np_ys)

with timed('np data, py dot'):
    dot(np_xs, np_ys)

with timed('py data, np dot'):
    array(py_xs).dot(py_ys)

Consequences:

Rules of numpy (when working with numpy.ndarrays):

from numpy import array

xs = array([1, 2, 3])
ys = array([4, 5, 6])

print(f'{xs +  1 = }')
print(f'{xs + ys = }')
from numpy import array, broadcast_to

xs = array([1, 2, 3])

print(f'{xs + 1                         = }')
print(f'{xs                             = }')
print(f'{     broadcast_to(1, xs.shape) = }')
print(f'{xs + broadcast_to(1, xs.shape) = }')
from numpy.random import default_rng
rng = default_rng(0)

xs = rng.integers(-10, +10, size=(2, 3, 4))
ys = rng.integers(-10, +10, size=(3, 4))

print(f'{(xs + ys).shape = }')

xs = rng.integers(-10, +10, size=(2, 3, 4))
ys = rng.integers(-10, +10, size=(2, 3))

#  print(f'{(xs + ys).shape = }')

xs = rng.integers(-10, +10, size=(2, 3, 4))
ys = rng.integers(-10, +10, size=(2, 3, 2))

#  print(f'{(xs + ys).shape = }')
from numpy.random import default_rng
rng = default_rng(0)

xs = rng.integers(-10, +10, size=(2, 3, 4))

print(xs)
print(f'{xs.strides = }')

Rules of “broadcasting”:

from numpy import newaxis
from numpy.random import default_rng
rng = default_rng(0)

xs = rng.integers(-10, +10, size=(2, 3, 4))
ys = rng.integers(-10, +10, size=(   3, 4))

print(f'{(xs + ys).shape = }')

xs = rng.integers(-10, +10, size=(2, 3, 4))
ys = rng.integers(-10, +10, size=(   2, 3))

#  print(f'{(xs + ys).shape = }')

xs = rng.integers(-10, +10, size=(2, 3, 4))
ys = rng.integers(-10, +10, size=(   2, 3))
# ys.reshape(2, 3, 1)             2, 3, 1
# ys[..., newaxis]                2, 3, 1

print(f'{(xs + ys[..., newaxis]).shape = }')

The Rules of the Game: Pandas

Rules of numpy (when working with pandas.Series & pandas.DataFrame):

from numpy import ceil
from pandas import DataFrame, Series, date_range, merge, CategoricalIndex, Interval, MultiIndex, IndexSlice
from enum import Enum
from numpy.random import default_rng

rng = default_rng(0)

Product = Enum('Product', 'hot_dog hot_sausage lemonade')
Ingredient = Enum('Ingredient', 'bun frankfurter sausage onion lemon sugar')

prices = Series({
    Ingredient.bun:         1.00,
    Ingredient.frankfurter: 1.50,
    Ingredient.sausage:     1.75,
    Ingredient.onion:       0.75,
    Ingredient.lemon:       0.25,
    Ingredient.sugar:       1.25,
})
prices.name = 'prices'
prices.index = CategoricalIndex(prices.index, name='ingredient')

recipes = DataFrame({
    Product.hot_dog:
        Series({Ingredient.bun: .1, Ingredient.frankfurter: .125}),
    Product.hot_sausage:
        Series({Ingredient.bun: .1, Ingredient.sausage: .125, Ingredient.onion: .25}),
    Product.lemonade:
        Series({Ingredient.lemon: 4, Ingredient.sugar: .025}),
}).fillna(0)
recipes.columns = CategoricalIndex(recipes.columns, name='product')
recipes.index = CategoricalIndex(recipes.index, name='ingredient')
recipes = recipes.T

sales = DataFrame({
    Product.hot_dog:     rng.integers(1, 100, size=(size := 5)),
    Product.hot_sausage: rng.integers(1, 100, size=size),
    Product.lemonade:    rng.integers(1, 100, size=size),
}, index=date_range('2021-07-04', periods=size)).rename_axis('date')
sales.columns = CategoricalIndex(sales.columns, name='product')

print(
    prices,
    recipes,
    sales,

    #  (recipes * prices),
    #  (recipes * prices).sum(axis='columns'),
    #  (recipes @ prices),
    #  (recipes @ prices) * sales,
    #  ((recipes @ prices) * sales).sum(),

    recipes.stack(),
    sales.T.stack(),
    merge(
        Series(recipes.stack(), name='recipes'),
        Series(sales.T.stack(), name='sales'),
        left_index=True, right_index=True,
    ).reorder_levels(['date', 'product', 'ingredient']).sort_index()
     .product(axis='columns')
     .pipe(ceil)
     .pipe(lambda df: df * prices)
     .groupby(['date', 'product']).sum()
     .pipe(lambda df: df / sales.stack())
     .unstack()
     .plot()
     ,

    sep='\n\n',
)

from matplotlib.pyplot import show; show()
from pandas import Series, date_range, concat
from numpy.random import default_rng

rng = default_rng(0)

idx = date_range('2021-01-01', periods=10, freq='12H')
s = Series(rng.integers(-10, +10, size=len(idx)), index=idx)
s.attrs['metadata'] = ...

print(
    s,
    s.attrs,

    s.groupby(s.index.date).mean(),
    s.groupby(s.index.date).mean().attrs,

    s.loc['2021-01-01'],
    s.loc['2021-01-01'].attrs,

    s.loc[:'2021-01-01'],
    s.loc[:'2021-01-01'].attrs,

    s.groupby(s.index.date).agg('mean').attrs,
    s.groupby(s.index.date).agg(lambda s: s.mean()).attrs,

    sep='\n\n',
)

The pandas.Index

from pandas import Series
from numpy.random import default_rng

rng = default_rng(0)

s = Series(rng.integers(-10, +10, size=10))

print(
    s,
    #  s.array,
    #  s.array._ndarray,

    s.iloc[0],
    s.iloc[0:],

    sep='\n\n',
)
from pandas import Series, date_range
from numpy.random import default_rng

rng = default_rng(0)

idx = date_range('2021-01-01', periods=(size := 10), freq='12H')
s = Series(rng.integers(-10, +10, size=size), index=idx)

print(
    s,
    s.index,

    s.loc['2021-01-01'],
    #  s.loc['2021-01-01':],

    #  s['2021-01-01'], # implicit label
    #  s[0:1],          # implicit position

    sep='\n\n',
)
from pandas import Index
import pandas

print(
    '\n'.join(sorted({
        attr for attr in dir(pandas)
             if isinstance(obj := getattr(pandas, attr), type)
                and issubclass(obj, Index)
    }))
)
from pandas import Series, Float64Index, Int64Index
from numpy.random import default_rng

rng = default_rng(0)

idx = Int64Index(rng.integers(-10, +10, size=10))
idx = Float64Index(rng.random(size=10))
idx = Float64Index([.5, 1, 1.5])
s   = Series(rng.integers(-10, +10, size=len(idx)), index=idx)

print(
    s,

    #  s.index,
    #  s.index.array,
    #  s.index.array._ndarray,

    #  s.loc[0.912756],
    s.loc[.5],

    sep='\n\n',
)
from pandas import RangeIndex, Series
from numpy.random import default_rng

rng = default_rng(0)

idx = RangeIndex(0, 10, 2)
s   = Series(rng.integers(-10, +10, size=len(idx)), index=idx)

print(
    s,

    #  s.index,
    #  s.index.array,
    #  s.index.array._ndarray,

    sep='\n\n',
)

print(
    #  f'{RangeIndex(0, 10) = }',
    #  f'{RangeIndex(0, 10_000_000_000_000_000_000) = }',
    #  f'{RangeIndex(10).get_loc = }',
    #  f'{RangeIndex(10).is_monotonic = }',
    #  f'{RangeIndex(10).union = }',
    #  f'{RangeIndex(10).intersection = }',
    #  f'{RangeIndex(10).difference = }',
    sep='\n',
)

print(
    f'{RangeIndex(5).union(RangeIndex(5, 10)) = }',
    f'{RangeIndex(5).union(RangeIndex(7, 10)) = }',
    sep='\n',
)
from pandas import Series, Interval, Index
from numpy.random import default_rng

rng = default_rng(0)

idx = Index([
    Interval( 0, 10, closed='left'),
    Interval(10, 20, closed='left'),
    Interval(20, 30, closed='left'),
])

s = Series(rng.integers(-10, +10, size=len(idx)), index=idx)

print(
    s,

    s.loc[5],

    #  s.index,
    #  s.index.array,
    s.index.array._left,
    s.index.array._right,

    sep='\n\n',
)
from pandas import to_datetime, timedelta_range, date_range, Period, Timedelta, Index, Series
from numpy.random import default_rng

rng = default_rng(0)

idx = Index([
    to_datetime('2021-01-01'),
    to_datetime('2021-01-02'),
    to_datetime('2021-01-03'),
])
idx = date_range('2021-01-01', periods=3)

idx = Index([
    to_datetime('2021-01-01'),
    to_datetime('2021-01-02'),
    to_datetime('2021-01-03'),
]) - to_datetime('2021-01-01')
idx = timedelta_range(start='0 day', end='10 day')

idx = Index([
    Period('2021Q1'),
    Period('2021Q2'),
    Period('2021Q3'),
])

s = Series(rng.integers(-10, +10, size=len(idx)), index=idx)

print(
    s,

    s.index,

    s.loc['2021-01-01':'2021-05-01'],

    sep='\n\n',
)
from pandas import Index, CategoricalIndex, Series
from numpy.random import default_rng

rng = default_rng(0)

#  idx = Index([
#      'abc',
#      'def',
#      'xyz',
#      'abc',
#      'abc',
#  ])
idx = CategoricalIndex([
    'abc',
    'def',
    'xyz',
    'abc',
    'abc',
])

s = Series(rng.integers(-10, +10, size=len(idx)), index=idx)

print(
    s,

    s.index,

    sep='\n\n',
)

The pandas.MultiIndex

from pandas import Series, MultiIndex, date_range, IndexSlice, DataFrame
from numpy.random import default_rng
from numpy import repeat, tile

rng = default_rng(0)

idx = MultiIndex.from_product([
    date_range('2021-01-01', periods=3),
    ['abc', 'def', 'xyz'],
])
s = Series(rng.integers(-10, +10, size=len(idx)), index=idx)

dates = date_range('2021-01-01', periods=6, freq='12H')
tickers = ['abc', 'def', 'xyz']
df = DataFrame({
    'date':   repeat(dates, len(tickers)),
    'ticker': tile(tickers, len(dates)),
    'signal': rng.normal(size=len(dates) * len(tickers)),
    'price':  100 * rng.random(size=len(dates) * len(tickers)),
})
#  df = df.set_index(['date', 'ticker'])

print(
    #  s,

    #  s.index,

    #  s.loc['2021-01-01'],
    #  s.loc['2021-01-01', 'abc'],
    #  s.loc[IndexSlice[:, 'abc', :]],
    
    sep='\n\n',
)

print(
    df,

    #  df.groupby([df['date'].dt.date, df['ticker']]).agg('mean'),
    #  df.set_index(['date', 'ticker']).pipe(
    #      lambda df: df.groupby(df.index.get_level_values('date').date).agg({'price': ['min', 'max']})
    #  ),
    #  df.set_index(['date', 'ticker']).pipe(
    #      lambda df: df.groupby(df.index.get_level_values('date').date).agg({'price': ['min', 'max']})
    #  ).columns,
    df.set_index(['date', 'ticker']).pipe(
        lambda df: df.groupby(df.index.get_level_values('date').date).agg({'price': ['min', 'max'], 'signal': ['mean', 'var']})
    )['signal'],
    sep='\n\n',
)

A pandas.MultiIndex Challenge

from numpy import ceil
from pandas import DataFrame, Series, date_range, merge, CategoricalIndex, Interval, MultiIndex, IndexSlice
from enum import Enum
from numpy.random import default_rng

rng = default_rng(0)

Product = Enum('Product', 'hot_dog hot_sausage lemonade')
Ingredient = Enum('Ingredient', 'bun frankfurter sausage onion lemon sugar')

prices = Series({
    Ingredient.bun:         1.00,
    Ingredient.frankfurter: 1.50,
    Ingredient.sausage:     1.75,
    Ingredient.onion:       0.75,
    Ingredient.lemon:       0.25,
    Ingredient.sugar:       1.25,
})
prices.name = 'prices'
prices.index = CategoricalIndex(prices.index, name='ingredient')

recipes = DataFrame({
    Product.hot_dog:
        Series({Ingredient.bun: .1, Ingredient.frankfurter: .125}),
    Product.hot_sausage:
        Series({Ingredient.bun: .1, Ingredient.sausage: .125, Ingredient.onion: .25}),
    Product.lemonade:
        Series({Ingredient.lemon: 4, Ingredient.sugar: .025}),
}).fillna(0)
recipes.columns = CategoricalIndex(recipes.columns, name='product')
recipes.index = CategoricalIndex(recipes.index, name='ingredient')
recipes = recipes.T

sales = DataFrame({
    Product.hot_dog:     rng.integers(1, 100, size=(size := 5)),
    Product.hot_sausage: rng.integers(1, 100, size=size),
    Product.lemonade:    rng.integers(1, 100, size=size),
}, index=date_range('2021-07-04', periods=size)).rename_axis('date')
sales.columns = CategoricalIndex(sales.columns, name='product')

minimums = DataFrame({
    Ingredient.bun: Series({
        Interval(0, 5, closed='left'): 5,
        **{Interval(x, x+1, closed='left'): x for x in range(5, 250)},
        Interval(250, float('inf'), closed='left'): 250,
    }),
    Ingredient.frankfurter: Series({
        Interval(0, 15, closed='left'): 15,
        **{Interval(x, x+1, closed='left'): x for x in range(15, 100)},
        Interval(100, float('inf'), closed='left'): 100,
    }),
    Ingredient.sausage: Series({
        Interval(0, 15, closed='left'): 15,
        **{Interval(x, x+1, closed='left'): x for x in range(15, 100)},
        Interval(100, float('inf'), closed='left'): 100,
    }),
    Ingredient.onion: Series({
        Interval(x, x+1, closed='left'): x for x in range(0, 500)
    }),
    Ingredient.lemon: Series({
        Interval(0, 5, closed='left'): 5,
        **{Interval(x, x+1, closed='left'): x for x in range(5, 500)},
        Interval(500, float('inf'), closed='left'): 500,
    }),
    Ingredient.sugar: Series({
        Interval(x, x+1, closed='left'): x for x in range(0, 50)
    }),
})

print(
    #  prices,
    #  recipes,
    #  sales,
    minimums,

    #  merge(
    #      Series(recipes.stack(), name='recipes'),
    #      Series(sales.T.stack(), name='sales'),
    #      left_index=True, right_index=True,
    #  ).reorder_levels(['date', 'product', 'ingredient']).sort_index()
    #   .product(axis='columns')
    #   #  .pipe(ceil)
    #   #  .pipe(lambda df: df * prices)
    #   #  .groupby(['date', 'product']).sum()
    #   #  .pipe(lambda df: df / sales.stack())
    #   #  .unstack()
    #   #  .plot()
    #   ,

    sep='\n\n',
)

from matplotlib.pyplot import show; show()

Questions

from pandas import DataFrame, merge

df1 = DataFrame({
    'a': [0, 1, 2],
    'b': [0, 1, 2],
    'values1': [0, 1, 2],
}).set_index(['a', 'b'])
#  df1.index.names = None, None

df2 = DataFrame({
    'a': [0, 1, 2],
    'values2': [0, 1, 2],
}).set_index(['a'])

print(
    #  merge(df1, df2, left_index=True, right_index=True),
    df1.join(df2),
)