White Shadow (Mike Bell): Here one second, gone the next. They call me… the White Shadow! I can move so fast, all you see is my shadow.
Turbo (Ryan Reynolds): I don’t get it.
White Shadow (Mike Bell): I’m fast, like a shadow!
Turbo (Ryan Reynolds): But shadows, they’re not inherently fast.
White Shadow (Mike Bell): White Shadowwww…
Turbo (Ryan Reynolds): I can still see you.
— Turbo (2013)
| Date | Time | Track | Meeting Link |
|---|---|---|---|
| Fri, Jan 21, 2022 | 9:30 AM EST | Performance & Tooling | https://primetime.bluejeans.com/a2m/live-event/jkvwcrph |
These sessions are designed for a broad audience of modelers and software programmers of all backgrounds and skill levels.
Our expected audience should comprise attendees with…
pandas. During this session, we will endeavour to guide our audience toward developing…
…and we will share additional tips, tricks, and in-depth guidance on all of these topics!
In previous episodes, we’ve performed very simple analyses of code, to help
motivate a better understanding of the design and use of tools like pandas
and numpy. In this episode, we’ll take a closer look at the task of analysing
code for performance—we’ll look at tools and techniques for “profiling” our
code.
We’ll discuss traditional tools, like cProfile in the standard library, and
show how they can be used to spot and address performance issues. We’ll take
this further, and discuss limitations to the traditional profiling approach.
We’ll introduce newly popular techniques, like the use of sampling profilers
such as scalene and pyspy, and show how they can identify and help resolve
performance issues in ways traditional tools might struggle.
Did you enjoy this episode? Did you learn something new that will help you as you continue or begin to use window methods in your work?
If so, stay tuned for future episodes, which may…
pandas analyses do not run out of memory.
print("Let's get started!")
time.perf_counter? What is the perf command? What is the timeit module? What are the %time line magic and %%time cell magic in IPython/Jupyter (and why should we include them liberally in our code?)
from time import sleep
# Toy three-stage pipeline; each stage sleeps to simulate real work.

def load_data():
    """Pretend to fetch raw data (the slowest stage: ~0.3 s)."""
    sleep(.3)
    return ...

def clean_data(data):
    """Pretend to clean the data (~0.1 s); passes it through unchanged."""
    sleep(.1)
    return data

def process_data(data):
    """Pretend to process the data (~0.2 s); passes it through unchanged."""
    sleep(.2)
    return data

raw_data = load_data()
data = clean_data(raw_data)
results = process_data(data)
print(f'{results = }')
from time import sleep, time

# Same toy pipeline, now bracketed with time() calls to measure the
# total wall-clock duration of one end-to-end run.

def load_data():
    """Pretend to fetch raw data (~0.3 s)."""
    sleep(.3)
    return ...

def clean_data(data):
    """Pretend to clean the data (~0.1 s)."""
    sleep(.1)
    return data

def process_data(data):
    """Pretend to process the data (~0.2 s)."""
    sleep(.2)
    return data

before = time()
raw_data = load_data()
data = clean_data(raw_data)
results = process_data(data)
print(f'{results = }')
after = time()
print(f'{after - before = :.2f}')
from time import sleep, perf_counter
from contextlib import contextmanager
@contextmanager
def timed(msg):
    """Context manager that prints the wall-clock duration of its body.

    Prints `msg` (left-padded to 16 characters) followed by the elapsed
    time measured with perf_counter(). The report is printed even when
    the body raises. The original version assigned `stop` only after a
    successful `yield`, so an exception in the body made the `finally`
    print raise NameError, masking the real error; `stop` is now taken
    inside the `finally` block itself.
    """
    start = perf_counter()
    try:
        yield
    finally:
        stop = perf_counter()
        print(f'{msg:<16} \N{mathematical bold capital delta}t: {stop - start:.4f}s')
def load_data():
    """Simulated load step: sleeps 0.3 s and returns a placeholder."""
    sleep(.3)
    return ...

def clean_data(data):
    """Simulated clean step: sleeps 0.1 s, returns its input unchanged."""
    sleep(.1)
    return data

def process_data(data):
    """Simulated process step: sleeps 0.2 s, returns its input unchanged."""
    sleep(.2)
    return data
# Time nested phases of the pipeline with the timed() context manager.
# NOTE(review): the 'loading & cleaning' context only wraps load_data(),
# so its label overstates what it measures — presumably a deliberate
# mislabel for the talk; confirm.
with timed('overall'):
    with timed('loading & cleaning'):
        raw_data = load_data()
    with timed('cleaning'):
        data = clean_data(raw_data)
    results = process_data(data)
    print(f'{results = }')
from functools import wraps
from time import perf_counter, sleep
from random import random
def timed(f):
    """Decorator that reports how long each call to *f* takes."""
    @wraps(f)  # preserve f's __name__/__doc__ on the wrapper
    def wrapper(*args, **kwargs):
        begin = perf_counter()
        result = f(*args, **kwargs)
        end = perf_counter()
        print(f'{f.__name__}: \N{mathematical bold capital delta}t: {end - begin:.4f}s')
        return result
    return wrapper
# Demonstrate the decorator: each call to f() sleeps a random 0–1 s and
# the @timed wrapper prints the measured duration.
@timed
def f():
    sleep(random())

f()
from IPython import get_ipython
from time import sleep
from random import random

def f():
    # Random 0–1 s of "work" for the cell magic to time.
    sleep(random())

# Programmatic equivalent of a %%time cell: execute the cell body once
# and print its wall/CPU time. Requires a live IPython shell —
# get_ipython() returns None in a plain python process.
get_ipython().run_cell_magic(
    'time',
    '',
    '''
f()
''',
)
from IPython import get_ipython
from time import sleep
from random import random

def f():
    # Random 0–1 s of "work" to time.
    sleep(random())

# Programmatic `%time f()` (line magic): one timed execution.
# NOTE(review): InteractiveShell.magic() is deprecated in favour of
# run_line_magic('time', 'f()'), and magic() is usually given the text
# without the leading '%' — confirm against the IPython version in use.
get_ipython().magic('%time f()')
from IPython import get_ipython
from time import sleep
from random import random

def f():
    # Random 0–1 s of "work" to time.
    sleep(random())

# Programmatic `%timeit f()`: repeat the call adaptively and report
# per-loop timing statistics (mean ± stddev over several runs).
# NOTE(review): magic() is deprecated — run_line_magic() is the modern
# spelling; confirm the leading '%' is accepted on this IPython version.
get_ipython().magic('%timeit f()')
from timeit import timeit
from random import random

# Micro-benchmark: square 100_000 floats with a list comprehension vs
# [*map(lambda, ...)]. number=100 repeats each statement 100 times, so
# each timeit() call performs ~10M multiplications — expect a second or
# two of runtime. globals=globals() lets the timed statements see `xs`.
xs = [random() for _ in range(100_000)]
print(
    f'{timeit("[x**2 for x in xs]", globals=globals(), number=100) = :.2f}',
    f'{timeit("[*map(lambda x: x**2, xs)]", globals=globals(), number=100) = :.2f}',
    sep='\n',
)
# Shell equivalents: time the same comparison with the timeit CLI, then
# inspect a trivial interpreter run with perf(1) and time(1).
# NOTE: `python -m timeit` takes the statement as a positional argument;
# the Python-2-era `-c` (clock) flag was removed in Python 3.7 and never
# accepted code, so `-c '<stmt>'` fails on Python 3.
python -m timeit -s 'from random import random; xs = [random() for _ in range(100_000)]' '[x**2 for x in xs]'
python -m timeit -s 'from random import random; xs = [random() for _ in range(100_000)]' '[*map(lambda x: x**2, xs)]'
# perf stat reports hardware/software counters for the whole process;
# -S skips `import site` to show how much of the cost is interpreter startup.
# sudo perf stat python3 -c 'from time import sleep; sleep(.1)'
# sudo perf stat python3 -S -c 'from time import sleep; sleep(.1)'
sudo perf stat ipython3 -c 'from time import sleep; sleep(.1)'
time python -c 'from time import sleep; sleep(.1)'
ulimit and the idea of “high watermark” memory usage? numpy.ndarray or a pandas.DataFrame (and how can these be misleading)?
from numpy.random import default_rng
# Where does a NumPy array's buffer live, and how big is it?
# __array_interface__['data'][0] is the base address of the backing
# buffer; nbytes is its total size (100_000 float64s -> 800_000 bytes).
generator = default_rng(0)
data = generator.normal(size=100_000)
# data = generator.normal(size=100_000).astype('float16')
report = [
    f'{data.__array_interface__["data"][0] = :#_x}',
    f'{data.nbytes = :,}',
]
print('\n'.join(report))
from numpy.random import default_rng
from numpy import shares_memory

# A basic slice is a *view* (it shares the buffer); boolean-mask
# indexing materialises a *copy*. shares_memory confirms which is which.
rng = default_rng(0)
data = rng.normal(size=100_000)
half = len(data) // 2
view = data[:half]       # basic slice: same underlying buffer
copy = data[data > 0]    # fancy (mask) indexing: freshly allocated buffer
print(
    f'{shares_memory(data, view) = }',
    f'{shares_memory(data, copy) = }',
    sep='\n',
)
from pandas import DataFrame
from numpy.random import default_rng
from string import ascii_lowercase

# How much memory does a DataFrame report?
# memory_usage() counts only the column buffers (8 bytes per element
# here); deep=True additionally sizes the Python string objects behind
# the object column. The commented-out 'c' column builds 100_000 4-char
# strings the slow, per-row way; 'd' builds them vectorised: draw a
# (4, 100_000) array of single characters, then reinterpret each group
# of four '<U1' codes as one '<U4' string with .view().
rng = default_rng(0)
df = DataFrame({
    'a': rng.normal(size=100_000),
    'b': rng.integers(10, size=100_000),
    # 'c': [''.join(rng.choice([*ascii_lowercase], size=4)) for _ in range(100_000)],
    'd': rng.choice([*ascii_lowercase], size=(4, 100_000)).view('<U4').ravel(),
})
print(
    f'{df.memory_usage()}',
    f'{df.memory_usage(deep=True)}',
    # ._mgr is the internal BlockManager; the original read the ._data
    # alias, which has been deprecated (and warns) since pandas 1.3.
    f'{df._mgr}',
    sep='\n'
)
class T:
    """Minimal class whose finaliser announces when it runs."""

    def __del__(self):
        # Runs when the last reference disappears (immediately, under
        # CPython's reference counting).
        print(f'T.__del__({self})')

# Dropping the only reference triggers __del__ straight away...
instance = T()
del instance

# ...and so does a local going out of scope when a function returns.
def f():
    local = T()

f()
from gc import get_objects, get_referents, get_referrers

# The gc module can answer "who points at whom?":
#   get_objects()    -> every object the collector tracks
#   get_referents(o) -> objects that o refers to
#   get_referrers(o) -> objects that refer to o
class A: pass
class B: pass
class C: pass

x, y, z = A(), B(), C()
x.y, x.z = y, z

lines = (
    f'{[obj for obj in get_objects() if isinstance(obj, (A, B, C))] = }',
    f'{[obj for obj in get_referents(x.__dict__) if isinstance(obj, (A, B, C))] = }',
    f'{[obj for obj in get_referrers(y)][0] = }',
)
print(*lines, sep='\n')
# Cap this shell's virtual memory (~69 MB) so the allocation below fails
# fast instead of swapping — a cheap way to probe "high watermark"
# memory behaviour. NOTE: ulimit applies to the whole shell session.
ulimit -v 69000; python -c 'from numpy.random import normal; xs = normal(size=100_000)'
tracemalloc to identify which parts of my code are allocating memory? What is the tracemalloc module, how do I use it, and how do I analyse its outputs? What are its limitations?
from tracemalloc import start, take_snapshot; start()
from contextlib import contextmanager
from random import random
@contextmanager
def profile_memory():
    """Print the top five line-level allocation deltas for the with-body.

    Takes a tracemalloc snapshot before and after the managed block
    (tracing must already be started) and prints the five biggest
    differences grouped by source line. The finally-clause guarantees
    the report appears even when the body raises.
    """
    try:
        before = take_snapshot()
        yield
    finally:
        after = take_snapshot()
        top_five = after.compare_to(before, 'lineno')[:5]
        for entry in top_five:
            print(entry)
# Allocate two throwaway lists under the profiler; the report should
# attribute ~100k and ~200k float allocations to the two lines below.
with profile_memory():
    xs = [random() for _ in range(100_000)]
    ys = [random() for _ in range(200_000)]
    pass  # no-op, kept from the original (likely a leftover placeholder)
from tracemalloc import start, take_snapshot; start()
from contextlib import contextmanager
from numpy.random import default_rng
from numpy import sqrt
rng = default_rng(0)

@contextmanager
def profile_memory():
    """Print the top five allocation deltas, aggregated per source file.

    Like the per-line variant, but compare_to(..., 'filename') groups
    the snapshot difference by file, which better shows which *library*
    (e.g. numpy) did the allocating. Report prints even if the body
    raises, thanks to the finally-clause.
    """
    try:
        before = take_snapshot()
        yield
    finally:
        after = take_snapshot()
        for difference in after.compare_to(before, 'filename')[:5]:
            print(difference)
# Evaluate one root of a*x**2 + b*x + c = 0 using explicit in-place
# numpy ops, so tracemalloc can attribute every temporary buffer.
a, b, c = rng.integers(10, size=(size := 100_00)), rng.integers(10, size=size), rng.integers(10, size=size)
# Keep only triples with a real root (positive discriminant) and a != 0.
mask = (b**2 > 4*a*c) & (2*a != 0)
a, b, c = a[mask], b[mask], c[mask]
with profile_memory():
    # xs = (-b + sqrt(b**2 - 4*a*c)) / (2*a)
    term0 = 2. * a            # denominator 2a (float array)
    term1 = term0.copy()      # scratch buffer, updated in place below
    term1 *= 2                # 4a
    term1 *= c                # 4ac
    term1 *= -1               # -4ac
    term1 += b**2             # b**2 - 4ac (the discriminant)
    sqrt(term1, out=term1)    # sqrt(b**2 - 4ac), in place
    term1 -= b                # -b + sqrt(b**2 - 4ac)
    # Divide by 2a to complete the quadratic formula. The original never
    # performed this step — term0 was computed but only ever copied, so
    # the "roots" were off by a factor of 2a.
    term1 /= term0
    xs = term1
    del term0, term1
from IPython import get_ipython
from numpy.random import default_rng
from numpy import sqrt
# Deterministic test data: three integer coefficient arrays, filtered so
# every (a, b, c) triple has a real root and a nonzero leading term.
rng = default_rng(0)
size = 100_00
a = rng.integers(10, size=size)
b = rng.integers(10, size=size)
c = rng.integers(10, size=size)
mask = (b**2 > 4*a*c) & (2*a != 0)
a, b, c = a[mask], b[mask], c[mask]
def roots(a, b, c):
    """Return the larger real root of a*x**2 + b*x + c = 0.

    Implements (-b + sqrt(b**2 - 4ac)) / (2a), elementwise over numpy
    arrays. The original returned `-b + sqrt(...) / (2*a)`: operator
    precedence applied the division to the sqrt term only, which is not
    the quadratic formula.
    """
    return (-b + sqrt(b**2 - 4*a*c)) / (2*a)
# %mprun (from the third-party memory_profiler extension) reports
# line-by-line memory use of roots() during this call; requires a live
# IPython shell with the extension loaded.
get_ipython().magic('%mprun -f roots roots(a, b, c)')
cProfile to identify which parts of my code are slowing things down?cProfile module, how do I use it, and how do I analyse its outputs? What are its limitations?from cProfile import run
from time import sleep

def f():
    # Simulated unit of work: 0.1 s.
    sleep(.1)

def g():
    # 100 calls x 0.1 s -> ~10 s total; gives cProfile something to attribute.
    for _ in range(100):
        f()

# cProfile.run profiles the statement and prints a table of call counts
# and per-function cumulative times.
run('g()')
from IPython import get_ipython
from time import sleep

def f():
    # Simulated unit of work: 0.1 s.
    sleep(.1)

def g():
    # 100 calls x 0.1 s -> ~10 s of attributable time.
    for _ in range(100):
        f()

# Programmatic equivalent of the %%prun cell magic: profile g() with
# cProfile inside IPython and display the stats pager.
get_ipython().run_cell_magic(
    'prun',
    '',
    'g()',
)
scalene or pyspy to iteratively improve the speed of my code? What are the limitations of cProfile, and how can sampling profilers more effectively be used to direct optimisation activities?
from time import sleep
from random import random
def load_data():
    """Fixed-cost load step: 0.2 s, returns a placeholder."""
    sleep(.2)
    return ...

def f(data):
    """Variable-cost step: sleeps a random 0–1 s, passes data through."""
    sleep(random())
    return data

def g(data):
    """Variable-cost step that additionally pays for h()."""
    sleep(random())
    return h(data)

def h(data):
    """Variable-cost leaf step."""
    sleep(random())
    return data

def process_data(data):
    """Pipeline tail: runs g (which runs h), then f, on the data."""
    return f(g(data))
# Drive the pipeline 50 times (~1.7 s of sleeps per iteration on
# average): a long-enough run for a sampling profiler (scalene/py-spy)
# to attach to and attribute time across f, g, and h.
for _ in range(50):
    data = load_data()
    results = process_data(data)
print('Done.')
from time import sleep

def f():
    # 0.1 s of simulated work.
    sleep(.1)

def g():
    # 50 x 0.1 s -> ~5 s: a simple target for py-spy/scalene sampling.
    for _ in range(50):
        f()

g()
from pandas import DataFrame, Categorical, to_datetime, to_timedelta, Series
from numpy.random import default_rng
from string import ascii_lowercase
from scalene import scalene_profiler
rng = default_rng(0)
# 500k-row frame: cumulative event times, two numeric columns, and a
# categorical 2-letter code built vectorised (a (2, N) '<U1' array
# reinterpreted as '<U2' via .view()).
df = DataFrame({
    'dt': to_datetime('2000-01-01') + to_timedelta(rng.integers(5, size=(size := 500_000)).cumsum(), unit='s'),
    'a': rng.random(size=size),
    'b': rng.integers(10, size=size),
    'c': Categorical(rng.choice([*ascii_lowercase], size=(2, size)).view('<U2').ravel()),
})
s = DataFrame({
    'c': Categorical(rng.choice([*ascii_lowercase], size=(2, size)).view('<U2').ravel()),
    'd': rng.integers(10, size=size),
})
# Uncomment to capture just the join/groupby work below under scalene.
# scalene_profiler.start()
# NOTE(review): join(on='c') matches df['c'] against s's *index*, which
# here is the default integer RangeIndex — string codes cannot match
# integer labels. This looks like the deliberately-slow/wrong "before"
# version of the demo; confirm against the indexed version that follows.
print(
    df.join(s, on='c', lsuffix='', rsuffix='r')
        .groupby(['c', 'dt'])
        .agg('mean')
        .head(3)
    ,
    df.join(s, on='c', lsuffix='', rsuffix='r')
        .groupby('c')[['a', 'b', 'd']]
        .transform('sum')
        .head(3)
    ,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import DataFrame, Categorical, to_datetime, to_timedelta, Series
from numpy import array
from numpy.random import default_rng
from string import ascii_lowercase
from itertools import product
from scalene import scalene_profiler
rng = default_rng(0)
# All 676 two-letter codes; rng.choice over this array is the vectorised
# replacement for per-row ''.join string building.
entities = array([''.join(x) for x in product(ascii_lowercase, repeat=2)])
# 500k-row frame keyed by the categorical code 'c' so that join() can
# align on index labels instead of scanning a column.
df = DataFrame({
    'dt': to_datetime('2000-01-01') + to_timedelta(rng.integers(5, size=(size := 500_000)).cumsum(), unit='s'),
    'a': rng.random(size=size),
    'b': rng.integers(10, size=size),
    # 'c': Categorical(rng.choice([*ascii_lowercase], size=(2, size)).view('<U2').ravel()),
    'c': Categorical(rng.choice(entities, size=size)),
}).set_index('c')
s = Series(
    # index=Categorical(rng.choice([*ascii_lowercase], size=(2, size)).view('<U2').ravel()),
    index=Categorical(rng.choice(entities, size=size)),
    data=rng.integers(10, size=size),
    # DataFrame.join() refuses an unnamed Series, and the groupby that
    # follows selects column 'd' — the original omitted the name, so
    # df.join(s) raised ValueError before any profiling could happen.
    name='d',
)
# Uncomment to capture just the join/groupby work below under scalene.
# scalene_profiler.start()
# With both sides keyed by the categorical code (df's index and s's
# index), join aligns on matching labels rather than mismatched types.
# NOTE(review): this requires s to be a *named* Series ('d') so join()
# accepts it and the [['a', 'b', 'd']] selection can find 'd' — confirm
# the Series construction upstream.
print(
    df.join(s)
        .groupby(['c', 'dt'])
        .agg('mean')
        .head(3)
    ,
    df.join(s)
        .groupby('c')[['a', 'b', 'd']]
        .transform('sum')
        .head(3)
    ,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
What is our statistical model?
# Benchmark trivial arithmetic from the CLI. The statement is a
# positional argument; the Python-2-era `-c` (clock) flag was removed in
# Python 3.7 and never accepted code, so the original invocation fails.
python -m timeit '1 + 1'
from IPython import get_ipython

# Programmatic equivalent of `%timeit 1 + 1` in IPython: repeats the
# statement adaptively and reports per-loop timing statistics.
# NOTE(review): InteractiveShell.magic() is deprecated in favour of
# run_line_magic() — confirm against the IPython version in use.
get_ipython().magic('%timeit 1 + 1')
from random import random

# The four candidate spellings to compare under timeit: squaring
# 100_000 floats with a comprehension, a comprehension calling a lambda,
# and two map() variants. Evaluated once here (results discarded) just
# to show the expressions; the shell loop below does the actual timing.
xs = [random() for _ in range(100_000)]
f = lambda x: x**2
[x**2 for x in xs]
[f(x) for x in xs]
[*map(f, xs)]
list(map(f, xs))
# zsh driver: time each candidate statement across interpreter versions.
# setup/code are zsh arrays of source lines; "${(@j.;.)setup}" joins the
# array elements with ';' into a single -s setup string.
setup=(
    'from random import random'
    'xs = [random() for _ in range(50_000)]'
    'f = lambda x: x**2'
)
code=(
    '[x**2 for x in xs]'
    '[f(x) for x in xs]'
    '[*map(f, xs)]'
    'list(map(f, xs))'
)
version=( python:3.{3..10} )
for ver in "${(@)version}"; do
    printf "version = %s\n" "$ver"
    for c in "${(@)code}"; do
        # Time ONE candidate per iteration. The original passed the whole
        # joined `code` array (via "${(@j.;.)code}") instead of "$c", and
        # used the nonexistent `-c` timeit flag — the statement is a
        # positional argument in Python 3.
        # NOTE(review): $ver is printed but plain `python` is invoked;
        # wire the loop to the matching interpreter (e.g. via pyenv) for
        # the per-version comparison to be meaningful.
        printf "code = %s\tresult = %s\n" "${(r.24.)c}" "$(python -m timeit -s "${(@j.;.)setup}" "$c")"
    done
done
# Print a 40-character horizontal-rule separator with a blank line on
# either side (same separator the pandas demos pass to print(sep=...)).
print(
    '\n{}\n'.format('\N{box drawings light horizontal}'*40)
)