Index and MultiIndex in pandas)Dr. Peter Venkman (Bill Murray): Ray has gone bye-bye, Egon… what’ve you got left?
Dr. Egon Spengler (Harold Ramis): Sorry, Venkman, I’m terrified beyond the capacity for rational thought.
— Ghostbusters (1984)
| Date | Time | Track | Meeting Link |
|---|---|---|---|
| Fri Sep 10, 2021 | 9:30 AM EDT | Working with Indices and MultiIndices | Seminar VI: “Ghostbusters” |
These sessions are designed for a broad audience of modelers and software programmers of all backgrounds and skill levels.
Our expected audience should comprise attendees with at least a…
pandasDuring this session, we will endeavour to guide our audience to developing…
MultiIndex.In previous episodes, we have talked about the structure of the pandas.Series and pandas.DataFrame. We have shown examples that involve complex pandas.Index structures, including the use of explicitly hierarchical pandas.MultiIndexes.
In this session, we will take a closer look at the pandas.Index, its API, the common variations of indices we work with, and considerations when using an implicitly or explicitly hierarchical index. We will also take a close look at the use of the pandas.MultiIndex and review situations where the MultiIndex might come up, where we may want to embrace use of the MultiIndex, and common situations where we want to stay clear of its complexity!
Did you enjoy this episode? Did you learn something new that will help you as you perform analyses on pandas.DataFrames with more complex structures?
If so, stay tuned for future episodes, which may…
pandas, including the use of registered accessors, subtypes, and custom index types.pandas.Index can be used outside of the pandas.Series and pandas.DataFrame (e.g., how pandas.Index structures are used within xarray.)If there are other related topics you’d like to see covered, please reach out to Diego Torres Quintanilla.
print("Let's go!")
Rules of Python:
s = 'a,b,c'
xs = s.split(',')
xs = [x.upper() for x in xs]
Consequences:
Consequences:
from collections import namedtuple
from dataclasses import dataclass
from enum import Enum, auto
@dataclass
class BondInstrument:
issuer : object
@dataclass
class CdsInstrument:
issuer : object
Position = namedtuple('Position', 'instrument counterparty quantity')
Currency = Enum('Currency', 'AUD GBP USD')
# or
class Currency(Enum):
AUD, GBP, USD = auto(), auto(), auto()
portfolio = [
Position(BondInstrument(...), counterparty, -1_000_000),
Position(CdsInstrument(...), counterparty, 5_000_000),
]
from dataclasses import dataclass
from enum import Enum
Currency = Enum('Currency', 'AUD GBP USD')
@dataclass
class BondInstrument:
issuer : str
currency : Currency
@dataclass
class Position:
instrument : object
quantity : float
portfolio = [
Position(BondInstrument(..., Currency.USD), 10_000_000),
Position(BondInstrument(..., Currency.GBP), 10_000_000),
]
for pos in portfolio:
if pos.instrument.currency is Currency.USD:
pass
Consequence:
x, y = 1, 2
print(f'{x + y = }')
x, y = 1.1, 2.5
print(f'{x + y = }')
x, y = 1 + 1j, 2 + 5j
print(f'{x + y = }')
x, y = 'abc', 'def'
print(f'{x + y = }')
xs, ys = [1, 2, 3], [4, 5, 6]
print(f'{xs + ys = }')
from collections import Counter
c1, c2 = Counter('aabc'), Counter('aabbcd')
print(f'{c1 + c2 = }')
from numpy import array, shares_memory
portfolio = array([10_000_000, -1_000_000])
print(f'{portfolio.dtype = }')
print(f'{portfolio.shape = }')
print(f'{portfolio.strides = }')
print(f'{portfolio.__array_interface__["data"][0] = :#_x}')
print(f'{shares_memory(portfolio, portfolio[:1]) = }')
print(f'{portfolio.view("uint64") = }')
Rules of numpy:
from _utils import timed
from random import randint as py_randint
from numpy.random import randint as np_randint
from numpy import array
with timed('py create'):
py_xs = [py_randint(-1_000, +1_000) for _ in range(1_000_000)]
py_ys = [py_randint(-1_000, +1_000) for _ in range(1_000_000)]
with timed('np create'):
np_xs = np_randint(-1_000, +1_000, size=1_000_000)
np_ys = np_randint(-1_000, +1_000, size=1_000_000)
dot = lambda xs, ys: sum(x * y for x, y in zip(xs, ys))
with timed('py dot'):
dot(py_xs, py_ys)
with timed('np dot'):
np_xs.dot(np_ys)
with timed('np data, py dot'):
dot(np_xs, np_ys)
with timed('py data, np dot'):
array(py_xs).dot(py_ys)
Consequences:
Rules of numpy (when working with numpy.ndarrays):
from numpy import array
xs = array([1, 2, 3])
ys = array([4, 5, 6])
print(f'{xs + 1 = }')
print(f'{xs + ys = }')
from numpy import array, broadcast_to
xs = array([1, 2, 3])
print(f'{xs + 1 = }')
print(f'{xs = }')
print(f'{ broadcast_to(1, xs.shape) = }')
print(f'{xs + broadcast_to(1, xs.shape) = }')
from numpy.random import default_rng
rng = default_rng(0)
xs = rng.integers(-10, +10, size=(2, 3, 4))
ys = rng.integers(-10, +10, size=(3, 4))
print(f'{(xs + ys).shape = }')
xs = rng.integers(-10, +10, size=(2, 3, 4))
ys = rng.integers(-10, +10, size=(2, 3))
# print(f'{(xs + ys).shape = }')
xs = rng.integers(-10, +10, size=(2, 3, 4))
ys = rng.integers(-10, +10, size=(2, 3, 2))
# print(f'{(xs + ys).shape = }')
from numpy.random import default_rng
rng = default_rng(0)
xs = rng.integers(-10, +10, size=(2, 3, 4))
print(xs)
print(f'{xs.strides = }')
Rules of “broadcasting”:
from numpy import newaxis
from numpy.random import default_rng
rng = default_rng(0)
xs = rng.integers(-10, +10, size=(2, 3, 4))
ys = rng.integers(-10, +10, size=( 3, 4))
print(f'{(xs + ys).shape = }')
xs = rng.integers(-10, +10, size=(2, 3, 4))
ys = rng.integers(-10, +10, size=( 2, 3))
# print(f'{(xs + ys).shape = }')
xs = rng.integers(-10, +10, size=(2, 3, 4))
ys = rng.integers(-10, +10, size=( 2, 3))
# ys.reshape(2, 3, 1) 2, 3, 1
# ys[..., newaxis] 2, 3, 1
print(f'{(xs + ys[..., newaxis]).shape = }')
Rules of numpy (when working with pandas.Series & pandas.DataFrame):
from numpy import ceil
from pandas import DataFrame, Series, date_range, merge, CategoricalIndex, Interval, MultiIndex, IndexSlice
from enum import Enum
from numpy.random import default_rng
rng = default_rng(0)
Product = Enum('Product', 'hot_dog hot_sausage lemonade')
Ingredient = Enum('Ingredient', 'bun frankfurter sausage onion lemon sugar')
prices = Series({
Ingredient.bun: 1.00,
Ingredient.frankfurter: 1.50,
Ingredient.sausage: 1.75,
Ingredient.onion: 0.75,
Ingredient.lemon: 0.25,
Ingredient.sugar: 1.25,
})
prices.name = 'prices'
prices.index = CategoricalIndex(prices.index, name='ingredient')
recipes = DataFrame({
Product.hot_dog:
Series({Ingredient.bun: .1, Ingredient.frankfurter: .125}),
Product.hot_sausage:
Series({Ingredient.bun: .1, Ingredient.sausage: .125, Ingredient.onion: .25}),
Product.lemonade:
Series({Ingredient.lemon: 4, Ingredient.sugar: .025}),
}).fillna(0)
recipes.columns = CategoricalIndex(recipes.columns, name='product')
recipes.index = CategoricalIndex(recipes.index, name='ingredient')
recipes = recipes.T
sales = DataFrame({
Product.hot_dog: rng.integers(1, 100, size=(size := 5)),
Product.hot_sausage: rng.integers(1, 100, size=size),
Product.lemonade: rng.integers(1, 100, size=size),
}, index=date_range('2021-07-04', periods=size)).rename_axis('date')
sales.columns = CategoricalIndex(sales.columns, name='product')
print(
prices,
recipes,
sales,
# (recipes * prices),
# (recipes * prices).sum(axis='columns'),
# (recipes @ prices),
# (recipes @ prices) * sales,
# ((recipes @ prices) * sales).sum(),
recipes.stack(),
sales.T.stack(),
merge(
Series(recipes.stack(), name='recipes'),
Series(sales.T.stack(), name='sales'),
left_index=True, right_index=True,
).reorder_levels(['date', 'product', 'ingredient']).sort_index()
.product(axis='columns')
.pipe(ceil)
.pipe(lambda df: df * prices)
.groupby(['date', 'product']).sum()
.pipe(lambda df: df / sales.stack())
.unstack()
.plot()
,
sep='\n\n',
)
from matplotlib.pyplot import show; show()
from pandas import Series, date_range, concat
from numpy.random import default_rng
rng = default_rng(0)
idx = date_range('2021-01-01', periods=10, freq='12H')
s = Series(rng.integers(-10, +10, size=len(idx)), index=idx)
s.attrs['metadata'] = ...
print(
s,
s.attrs,
s.groupby(s.index.date).mean(),
s.groupby(s.index.date).mean().attrs,
s.loc['2021-01-01'],
s.loc['2021-01-01'].attrs,
s.loc[:'2021-01-01'],
s.loc[:'2021-01-01'].attrs,
s.groupby(s.index.date).agg('mean').attrs,
s.groupby(s.index.date).agg(lambda s: s.mean()).attrs,
sep='\n\n',
)
pandas.Indexfrom pandas import Series
from numpy.random import default_rng
rng = default_rng(0)
s = Series(rng.integers(-10, +10, size=10))
print(
s,
# s.array,
# s.array._ndarray,
s.iloc[0],
s.iloc[0:],
sep='\n\n',
)
from pandas import Series, date_range
from numpy.random import default_rng
rng = default_rng(0)
idx = date_range('2021-01-01', periods=(size := 10), freq='12H')
s = Series(rng.integers(-10, +10, size=size), index=idx)
print(
s,
s.index,
s.loc['2021-01-01'],
# s.loc['2021-01-01':],
# s['2021-01-01'], # implicit label
# s[0:1], # implicit position
sep='\n\n',
)
from pandas import Index
import pandas
print(
'\n'.join(sorted({
attr for attr in dir(pandas)
if isinstance(obj := getattr(pandas, attr), type)
and issubclass(obj, Index)
}))
)
from pandas import Series, Float64Index, Int64Index
from numpy.random import default_rng
rng = default_rng(0)
idx = Int64Index(rng.integers(-10, +10, size=10))
idx = Float64Index(rng.random(size=10))
idx = Float64Index([.5, 1, 1.5])
s = Series(rng.integers(-10, +10, size=len(idx)), index=idx)
print(
s,
# s.index,
# s.index.array,
# s.index.array._ndarray,
# s.loc[0.912756],
s.loc[.5],
sep='\n\n',
)
from pandas import RangeIndex, Series
from numpy.random import default_rng
rng = default_rng(0)
idx = RangeIndex(0, 10, 2)
s = Series(rng.integers(-10, +10, size=len(idx)), index=idx)
print(
s,
# s.index,
# s.index.array,
# s.index.array._ndarray,
sep='\n\n',
)
print(
# f'{RangeIndex(0, 10) = }',
# f'{RangeIndex(0, 10_000_000_000_000_000_000) = }',
# f'{RangeIndex(10).get_loc = }',
# f'{RangeIndex(10).is_monotonic = }',
# f'{RangeIndex(10).union = }',
# f'{RangeIndex(10).intersection = }',
# f'{RangeIndex(10).difference = }',
sep='\n',
)
print(
f'{RangeIndex(5).union(RangeIndex(5, 10)) = }',
f'{RangeIndex(5).union(RangeIndex(7, 10)) = }',
sep='\n',
)
from pandas import Series, Interval, Index
from numpy.random import default_rng
rng = default_rng(0)
idx = Index([
Interval( 0, 10, closed='left'),
Interval(10, 20, closed='left'),
Interval(20, 30, closed='left'),
])
s = Series(rng.integers(-10, +10, size=len(idx)), index=idx)
print(
s,
s.loc[5],
# s.index,
# s.index.array,
s.index.array._left,
s.index.array._right,
sep='\n\n',
)
from pandas import to_datetime, timedelta_range, date_range, Period, Timedelta, Index, Series
from numpy.random import default_rng
rng = default_rng(0)
idx = Index([
to_datetime('2021-01-01'),
to_datetime('2021-01-02'),
to_datetime('2021-01-03'),
])
idx = date_range('2021-01-01', periods=3)
idx = Index([
to_datetime('2021-01-01'),
to_datetime('2021-01-02'),
to_datetime('2021-01-03'),
]) - to_datetime('2021-01-01')
idx = timedelta_range(start='0 day', end='10 day')
idx = Index([
Period('2021Q1'),
Period('2021Q2'),
Period('2021Q3'),
])
s = Series(rng.integers(-10, +10, size=len(idx)), index=idx)
print(
s,
s.index,
s.loc['2021-01-01':'2021-05-01'],
sep='\n\n',
)
from pandas import Index, CategoricalIndex, Series
from numpy.random import default_rng
rng = default_rng(0)
# idx = Index([
# 'abc',
# 'def',
# 'xyz',
# 'abc',
# 'abc',
# ])
idx = CategoricalIndex([
'abc',
'def',
'xyz',
'abc',
'abc',
])
s = Series(rng.integers(-10, +10, size=len(idx)), index=idx)
print(
s,
s.index,
sep='\n\n',
)
pandas.MultiIndexfrom pandas import Series, MultiIndex, date_range, IndexSlice, DataFrame
from numpy.random import default_rng
from numpy import repeat, tile
rng = default_rng(0)
idx = MultiIndex.from_product([
date_range('2021-01-01', periods=3),
['abc', 'def', 'xyz'],
])
s = Series(rng.integers(-10, +10, size=len(idx)), index=idx)
dates = date_range('2021-01-01', periods=6, freq='12H')
tickers = ['abc', 'def', 'xyz']
df = DataFrame({
'date': repeat(dates, len(tickers)),
'ticker': tile(tickers, len(dates)),
'signal': rng.normal(size=len(dates) * len(tickers)),
'price': 100 * rng.random(size=len(dates) * len(tickers)),
})
# df = df.set_index(['date', 'ticker'])
print(
# s,
# s.index,
# s.loc['2021-01-01'],
# s.loc['2021-01-01', 'abc'],
# s.loc[IndexSlice[:, 'abc', :]],
sep='\n\n',
)
print(
df,
# df.groupby([df['date'].dt.date, df['ticker']]).agg('mean'),
# df.set_index(['date', 'ticker']).pipe(
# lambda df: df.groupby(df.index.get_level_values('date').date).agg({'price': ['min', 'max']})
# ),
# df.set_index(['date', 'ticker']).pipe(
# lambda df: df.groupby(df.index.get_level_values('date').date).agg({'price': ['min', 'max']})
# ).columns,
df.set_index(['date', 'ticker']).pipe(
lambda df: df.groupby(df.index.get_level_values('date').date).agg({'price': ['min', 'max'], 'signal': ['mean', 'var']})
)['signal'],
sep='\n\n',
)
pandas.MultiIndex Challengefrom numpy import ceil
from pandas import DataFrame, Series, date_range, merge, CategoricalIndex, Interval, MultiIndex, IndexSlice
from enum import Enum
from numpy.random import default_rng
rng = default_rng(0)
Product = Enum('Product', 'hot_dog hot_sausage lemonade')
Ingredient = Enum('Ingredient', 'bun frankfurter sausage onion lemon sugar')
prices = Series({
Ingredient.bun: 1.00,
Ingredient.frankfurter: 1.50,
Ingredient.sausage: 1.75,
Ingredient.onion: 0.75,
Ingredient.lemon: 0.25,
Ingredient.sugar: 1.25,
})
prices.name = 'prices'
prices.index = CategoricalIndex(prices.index, name='ingredient')
recipes = DataFrame({
Product.hot_dog:
Series({Ingredient.bun: .1, Ingredient.frankfurter: .125}),
Product.hot_sausage:
Series({Ingredient.bun: .1, Ingredient.sausage: .125, Ingredient.onion: .25}),
Product.lemonade:
Series({Ingredient.lemon: 4, Ingredient.sugar: .025}),
}).fillna(0)
recipes.columns = CategoricalIndex(recipes.columns, name='product')
recipes.index = CategoricalIndex(recipes.index, name='ingredient')
recipes = recipes.T
sales = DataFrame({
Product.hot_dog: rng.integers(1, 100, size=(size := 5)),
Product.hot_sausage: rng.integers(1, 100, size=size),
Product.lemonade: rng.integers(1, 100, size=size),
}, index=date_range('2021-07-04', periods=size)).rename_axis('date')
sales.columns = CategoricalIndex(sales.columns, name='product')
minimums = DataFrame({
Ingredient.bun: Series({
Interval(0, 5, closed='left'): 5,
**{Interval(x, x+1, closed='left'): x for x in range(5, 250)},
Interval(250, float('inf'), closed='left'): 250,
}),
Ingredient.frankfurter: Series({
Interval(0, 15, closed='left'): 15,
**{Interval(x, x+1, closed='left'): x for x in range(15, 100)},
Interval(100, float('inf'), closed='left'): 100,
}),
Ingredient.sausage: Series({
Interval(0, 15, closed='left'): 15,
**{Interval(x, x+1, closed='left'): x for x in range(15, 100)},
Interval(100, float('inf'), closed='left'): 100,
}),
Ingredient.onion: Series({
Interval(x, x+1, closed='left'): x for x in range(0, 500)
}),
Ingredient.lemon: Series({
Interval(0, 5, closed='left'): 5,
**{Interval(x, x+1, closed='left'): x for x in range(5, 500)},
Interval(500, float('inf'), closed='left'): 500,
}),
Ingredient.sugar: Series({
Interval(x, x+1, closed='left'): x for x in range(0, 50)
}),
})
print(
# prices,
# recipes,
# sales,
minimums,
# merge(
# Series(recipes.stack(), name='recipes'),
# Series(sales.T.stack(), name='sales'),
# left_index=True, right_index=True,
# ).reorder_levels(['date', 'product', 'ingredient']).sort_index()
# .product(axis='columns')
# # .pipe(ceil)
# # .pipe(lambda df: df * prices)
# # .groupby(['date', 'product']).sum()
# # .pipe(lambda df: df / sales.stack())
# # .unstack()
# # .plot()
# ,
sep='\n\n',
)
from matplotlib.pyplot import show; show()
from pandas import DataFrame, merge
df1 = DataFrame({
'a': [0, 1, 2],
'b': [0, 1, 2],
'values1': [0, 1, 2],
}).set_index(['a', 'b'])
# df1.index.names = None, None
df2 = DataFrame({
'a': [0, 1, 2],
'values2': [0, 1, 2],
}).set_index(['a'])
print(
# merge(df1, df2, left_index=True, right_index=True),
df1.join(df2),
)