“The following tale of alien encounters is true. And by true, I mean false. It’s all lies. But they’re entertaining lies. And, in the end, isn’t that the real truth? The answer is: no.” — ‘The Springfield Files’ (S08E10)
How do we handle inconsistent, missing, or poorly formatted data in our analyses? How do we ensure that our analyses are accurate if our data may have errors in it?
Do you…
Then join us for a session on efficient data cleaning in Python!
In this episode we’ll take a deep dive on a single dataset to better understand how we can effectively clean tabular data in pandas. We’ll discuss what clean data is and why untidy data can have disastrous effects while also exploring how we can best leverage pandas to tidy our data.
We’ll cover topics like wrangling strings, renaming, parsing datatypes, sorting/reindexing, reshaping, and binning to ensure your data is ready to be used by your models. Additionally, we’ll provide strategies you can use to verify the cleanliness of your data.
Keywords:
print("Let's take a look!")
# Simplest possible load: let pandas infer the header and dtypes from the
# first sheet of the workbook.
from pandas import read_excel
df = read_excel('data1.xlsx')
print(df)
from pandas import read_excel
df = read_excel('data2.xlsx')
# .dropna() removes every row containing at least one NaN — a blunt
# instrument: it silently discards data and does not repair a bad header.
print(df.dropna())
from pandas import read_excel
# header=None: treat every spreadsheet row as data (columns become 0..N) —
# useful for inspecting the raw layout when the real header is not row 0.
df = read_excel('data2.xlsx', header=None)
print(df)
from contextlib import closing
from itertools import islice
from collections import namedtuple
from string import ascii_uppercase
from openpyxl import open as pyxl_open
from numpy import array
from pandas import DataFrame, to_numeric

# Rectangular region of the worksheet that holds the actual table
# (0-indexed, inclusive bounds).
ViewArea = namedtuple('ViewArea', 'min_row max_row min_col max_col')

# data_only=True yields computed cell values rather than formulas; closing()
# guarantees the workbook handle is released even if extraction fails.
# FIX: the with-block body had lost its indentation; restored so the
# snippet actually runs.
with closing(pyxl_open('data2.xlsx', data_only=True)) as f:
    sheet = f.worksheets[0]
    view_area = ViewArea(
        min_row=3, max_row=6,
        min_col=0, max_col=3,
    )
    # Materialize only the cells inside the view area.
    cells = array([
        [c.value for c in islice(row, view_area.min_col, view_area.max_col+1)]
        for row in islice(sheet.rows, view_area.min_row, view_area.max_row+1)
    ])

# Sanity-check that the first extracted row really is the expected header.
assert cells[0, 0].strip().casefold() == 'name'
assert cells[0, 1].strip().casefold() == 'group'
assert cells[0, 2].strip().casefold() == 'value'

names = cells[1:, 0]
groups = cells[1:, 1]
values = cells[1:, 2]
# Rebuild a clean DataFrame; to_numeric repairs the string-typed values.
df = DataFrame({
    'name': names,
    'group': groups,
    'value': to_numeric(values),
}).set_index('name')
print(
    df,
    df.groupby('group').mean(),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
print("Let's take a look!")
from pandas import read_csv
# Without parse_dates, the 'date' column is read as plain strings (object).
df = read_csv('dt1.csv')
print(df)
from pandas import read_csv
# parse_dates converts the named column(s) to datetime64 during the read.
df = read_csv('dt1.csv', parse_dates=['date'])
print(df)
from pandas import read_csv, to_datetime
df = read_csv('dt1.csv')
# Equivalent two-step approach: read as strings, then convert explicitly.
df['date'] = to_datetime(df['date'])
print(df)
from pandas import to_datetime
# Inference on heterogeneous string formats, element by element
# (NOTE(review): recent pandas may require format='mixed' here — confirm).
dts = to_datetime(['2020-01-02 10:00:00', '02/01/2020', '20200102T10:00:00'])
# An explicit format= applies to every element; it does not match these
# strings, so this call is expected to raise ValueError in a live demo.
dts = to_datetime(['2020-01-02 10:00:00', '02/01/2020', '20200102T10:00:00'], format='%Y-%m-%d')
# unit='s' interprets *numeric* epoch offsets; passing strings is likewise
# expected to fail — shown for contrast.
dts = to_datetime(['2020-01-02 10:00:00', '02/01/2020', '20200102T10:00:00'], unit='s')
print(dts)
from io import StringIO
from numpy.random import default_rng
from pandas import DataFrame, to_datetime, to_timedelta, Categorical, read_csv
from contextlib import contextmanager
from time import perf_counter
from string import ascii_lowercase
@contextmanager
def timed(msg):
    """Context manager: print *msg* plus the wall-clock time its body took.

    Uses perf_counter (monotonic, high resolution).  The finally clause
    guarantees the elapsed time is reported even if the body raises.
    FIX: the function/try bodies had lost their indentation; restored.
    """
    try:
        start = perf_counter()
        yield
    finally:
        stop = perf_counter()
        print(f'{msg:<48} elapsed \N{mathematical bold capital delta}t: {stop - start:.6f}')
rng = default_rng(0)
# Build 3 random 4-character entity codes: draw a (3, 4) array of single
# letters, then reinterpret each row of 4 chars as one '<U4' string.
entities = rng.choice([*ascii_lowercase], size=(3, 4)).view('<U4').ravel()
df = DataFrame({
    'date': to_datetime('2020-01-01')
    + to_timedelta(rng.integers(60, size=(sz := 1_000_000)).cumsum(), unit='s'),
    'entity': Categorical(rng.choice(entities, size=sz)),
    'value': rng.normal(size=sz).round(2),
}).set_index('date').sort_index()
print(df.sample(3))

# Round-trip through an in-memory CSV so we can time three parsing strategies.
# FIX: the with-block bodies had lost their indentation; restored.
df.to_csv(csv := StringIO(), index=True)

csv.seek(0)
# 1) no date parsing at all — fastest read, but 'date' stays object dtype
with timed('read_csv(…)'):
    df = read_csv(csv, parse_dates=None)
    # print(df.dtypes)

csv.seek(0)
# 2) parse during the read
with timed('read_csv(…)'):
    df = read_csv(csv, parse_dates=['date'])
print(df.dtypes)

csv.seek(0)
# 3) read as strings, then convert afterwards with to_datetime
with timed('read_csv(…)'):
    df = read_csv(csv)
    df['date'] = to_datetime(df['date'])
print(df.dtypes)
# from pandas import to_datetime
def to_datetime(col):
    """Stand-in for pandas.to_datetime: consume *col* once and return None.

    Illustrates that a Python-level loop over every value is the lower
    bound of work any string-to-datetime conversion must do.
    FIX: the loop body had lost its indentation; restored.
    """
    for dt in col:
        pass
from pandas import read_csv, Timestamp
# parse_dates on a column of offset-bearing strings can leave the index as
# object dtype; mapping Timestamp over it converts element-wise (slow, and
# NOTE(review): each element presumably keeps its own tzinfo — confirm
# against dt2.csv's contents).
df = read_csv('dt2.csv', parse_dates=['date'], index_col=['date'])
df.index = df.index.map(Timestamp)
print(
df,
df.index,
f'{type(df.index[0]) = }',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import read_csv, to_datetime
df = read_csv('dt2.csv')
# Slice off everything past the 19-char 'YYYY-MM-DD HH:MM:SS' prefix (i.e.
# drop any textual UTC offset), parse the naive timestamps in one vectorized
# call, then attach a single timezone with tz_localize.
df['date'] = to_datetime(df['date'].str[:19]).dt.tz_localize('US/Eastern')
df = df.set_index('date')
print(
df,
df.index,
f'{type(df.index[0]) = }',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
print("Let's take a look!")
# 2**53 is the largest magnitude at which IEEE-754 doubles represent every
# integer exactly; beyond it, adding 1 is lost to rounding, so x == x + 1.
x = 2 ** 53.
print(
f'{x = :,}',
f'{x + 1 = :,}',
f'{x == x + 1 = :}',
sep='\n',
)
# Float addition is not associative: the grouping changes which rounding
# errors occur, so (x + y) + z and x + (y + z) print different results.
x = 2 ** 53.
y = 1
z = -(2 ** 53.)
print(
f'{(x + y) + z = :,}',
f'{x + (y + z) = :,}',
sep='\n',
)
# Bare expression: just shows how a float NaN is spelled.
float('nan')
from pandas import Series, NA
from numpy import nan
# s1 = Series([1, 2, 3, NA])
# s2 = Series([1, 2, 3, nan])
# The nullable 'Int64' extension dtype stores values and missingness in two
# separate arrays, so integers survive alongside NA (no float upcast).
s3 = Series([1, 2, 3, NA], dtype='Int64')
print(
# s1,
# s1.dtype,
# s2,
# _data/_mask are private internals — shown here for teaching only.
s3.array._data,
s3.array._mask,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import DataFrame
# 2**53: past float's exact-integer range, so any accidental float upcast
# (e.g. NaNs introduced by an unstack) would silently corrupt these values.
x = 2 ** 53
df = DataFrame({
'entity': [ 'a', 'b', 'a',],
'group': ['g1', 'g1', 'g2',],
'value': [ x, x+1, x+2,],
})
# Nullable Int64 keeps the column integral even where data goes missing.
df['value'] = df['value'].astype('Int64')
print(
df,
# df.groupby(['entity', 'group'])['value'].sum(),
# df.groupby(['entity', 'group'])['value'].sum().unstack(),
# df.groupby(['entity', 'group'])['value'].sum().unstack().fillna(0),
# df.groupby(['entity', 'group'])['value'].sum().unstack().fillna(0).convert_dtypes(),
# df.groupby(['entity', 'group'])['value'].sum().unstack(fill_value=0),
# pivot_table with fill_value sidesteps the NaN → float round-trip entirely.
df.pivot_table(
index=['entity'],
columns=['group'],
values=['value'],
aggfunc='sum',
fill_value=0,
),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import DataFrame, date_range
from numpy.random import default_rng
rng = default_rng(0)
# 90 daily values; .sample(frac=.80) randomly drops 20% of the days to
# simulate missing observations, then we re-sort by date.
df = DataFrame({
'date': date_range('2020-01-01', periods=(sz := 90)),
'value': rng.normal(size=sz).round(2),
}).set_index('date').sample(frac=.80, random_state=rng).sort_index()
# Each commented line shows one gap-filling strategy: reindex to the full
# calendar then fill/interpolate, or resample('D') to the same effect.
print(
# df.head(10),
# f'{len(df) = }',
# df.reindex(date_range('2020-01-01', periods=90)).fillna(0),
# df.reindex(date_range('2020-01-01', periods=90)).bfill(),
# df.reindex(date_range('2020-01-01', periods=90)).ffill(),
# df.reindex(date_range('2020-01-01', periods=90)).interpolate(),
# df.resample('D').interpolate(),
# df.resample('D').ffill(),
# df.resample('D').bfill(),
df.resample('D').mean(),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
What topics do you want to hear about?
What tools or technologies are you struggling with?
What new technologies do you want to learn more about?
How can we better support your use of Python, pandas, and open source scientific computing tools in your work?
print("Let's take a look!")
from pandas import DataFrame
# 'details' packs multiple values into one comma-delimited cell — untidy.
df = DataFrame({
'entity': ['a', 'b', 'c'],
'details': ['x,y', 'x', 'z'],
})
print(
df,
# df['details'].str.contains('x'),
# df['details'].str.split(',').explode(),
# Split each cell into a list, then explode to one row per element while
# keeping 'entity' aligned.
df.assign(
details=lambda df: df['details'].str.split(','),
).explode('details'),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import DataFrame
# Same idea but with messy whitespace and inconsistent case in the cells.
df = DataFrame({
'entity': ['a', 'b', 'c'],
'details': ['x ,y', ' x', 'Z '],
})
# Each printed line adds one normalization step: strip the whole cell,
# split on commas, explode, strip each token, lower-case.
print(
df.sample(3),
df['details'].str.strip(),
df['details'].str.strip().str.split(','),
df['details'].str.strip().str.split(',').explode().str.strip(),
df['details'].str.strip().str.split(',').explode().str.strip().str.lower(),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import DataFrame
from re import compile as re_compile

df = DataFrame({
    'entity': ['a', 'b', 'c'],
    'details': [
        'category = xy, subcategory = y',
        'category = w',
        'category = x, subcategory = z',
    ],
})

# Pattern: 'category = <word>' followed by an optional ', subcategory =
# <word>', tolerating arbitrary whitespace between tokens.
# BUG FIX: the category group was (?P<category>\w) — a single character —
# which truncated 'xy' to 'x' and then left 'y, …' unmatched so the optional
# subcategory clause never matched; \w+ captures the whole token.
regex = re_compile(r'\s*'.join([
    'category', '=', r'(?P<category>\w+)',
    r'(?:{})?'.format(r'\s*'.join([
        ',', 'subcategory', '=', r'(?P<subcategory>\w+)',
    ]))
]))

print(
    df,
    df.set_index('entity')['details'].str.extract(regex)['category'].dropna(),
    df.set_index('entity')['details'].str.extract(regex)['subcategory'].dropna(),
    # df.join(
    #     df['details'].str.extract(regex)
    # ),
    # join aligns the extracted columns back onto the original frame
    df.join(
        df['details'].str.extract(regex)
    ),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
print("Let's take a look!")
from pandas import DataFrame
from numpy.random import default_rng
rng = default_rng(0)
# Four spellings, but three refer to the same underlying company — a
# classic entity-resolution problem.
entities = [
'Company',
'Company, Inc.',
'Company, LLC',
'Other Company',
]
df = DataFrame({
'entity': entities,
'value': rng.normal(size=len(entities)).round(2),
})
print(
df,
# Grouping on raw strings treats each spelling as a distinct entity.
df.groupby('entity').sum(),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import DataFrame
from numpy.random import default_rng
from re import compile as re_compile

rng = default_rng(0)
entities = [
    'Company',
    'Company, Inc.',
    'Company, LLC',
    'Other Company',
]
df = DataFrame({
    'entity': entities,
    'value': rng.normal(size=len(entities)).round(2),
})

# First word of the name as a crude canonical key.
# FIX: use a raw string — '(\w+)' contains an invalid escape sequence,
# which is a SyntaxWarning in modern CPython and slated to become an error.
regex = re_compile(r'(\w+)')

print(
    df,
    df.groupby('entity').sum(),
    # normalizing first lets the groupby merge the three 'Company' variants
    df.assign(
        entity=lambda df: df['entity'].str.extract(regex)
    ),#.groupby('entity').sum(),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import DataFrame
from numpy.random import default_rng
from re import compile as re_compile
rng = default_rng(0)
entities = [
'Company',
'Company, Inc.',
'Company, LLC',
'Other Company',
]
# Explicit alias → canonical-name mapping.
mapping = {
'Company, Inc.': 'Company',
'Company, LLC': 'Company',
}
df = DataFrame({
'entity': entities,
'value': rng.normal(size=len(entities)).round(2),
})
print(
df,
# Series.map with a plain dict returns NaN for keys that are absent
# ('Company', 'Other Company' here) — missing keys are NOT passed through.
df['entity'].map(mapping),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import DataFrame
from numpy.random import default_rng
from re import compile as re_compile

rng = default_rng(0)
entities = [
    'Company',
    'Company, Inc.',
    'Company, LLC',
    'Other Company',
]

# FIX: the class/method bodies had lost their indentation; restored.
class passthru(dict):
    """dict that returns unknown keys unchanged instead of raising KeyError."""
    def __missing__(self, key):
        return key

mapping = passthru({
    'Company, Inc.': 'Company',
    'Company, LLC': 'Company',
})
df = DataFrame({
    'entity': entities,
    'value': rng.normal(size=len(entities)).round(2),
})

print(
    df,
    # Series.map detects __missing__ and leaves unmapped names intact
    # rather than turning them into NaN.
    df['entity'].map(mapping),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import DataFrame, Series
from numpy.random import default_rng
from re import compile as re_compile

rng = default_rng(0)
entities = [
    'Company',
    'Company, Inc.',
    'Company, LLC',
    'Other Company',
]

# Identity mapping as a Series, then override the aliases — .loc lookups
# pass unknown labels through untouched by construction.
mapping = Series(entities, index=entities)
mapping.loc['Company, Inc.'] = 'Company'
mapping.loc['Company, LLC'] = 'Company'

df = DataFrame({
    'entity': entities,
    'value': rng.normal(size=len(entities)).round(2),
})

print(
    # df,
    df['entity'],
    # mapping.loc[df['entity']],
    # df.assign(clean_entity=mapping.loc[df['entity']].values),
    # df.assign(clean_entity=mapping.loc[df['entity']].values)
    # .groupby('clean_entity').sum()
    # ,
    # .values strips the mapping's index so assignment aligns positionally.
    # FIX: aggregate with the string 'sum' — passing the builtin sum makes
    # newer pandas emit a FutureWarning and take a slower dispatch path;
    # the numeric result is identical.
    df.assign(clean_entity=mapping.loc[df['entity']].values)
    .groupby('clean_entity').agg({
        'entity': set,
        'value': 'sum',
    })
    ,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
print("Let's take a look!")
# A typical cleaning pipeline, in the order the steps usually happen:
# 1. initial data load
# 2. type conversions
# - Categorical
# - numeric types
# - datetime types
# 3. data extraction
# 4. entity mapping
# 5. quality/sanity checks
# 6. dataset extraction & preparation
# 7. missing data
from pandas import read_csv, to_datetime, Categorical
# load; first CSV column becomes the row index
df = read_csv('data.csv', index_col=[0])
# type conversions
df['date'] = to_datetime(df['date'])
df['price'] = df['price'].astype(float)
df['volumes'] = df['volumes'].astype(int)
# normalize the column name
df = df.rename({'volumes': 'volume'}, axis='columns')
# pull the industry token out of its 'industry = …' wrapper
df['industry'] = df['industry'].str.extract(r'industry = (?P<industry>\w+)')
df['ticker'] = df['ticker'].str.lower()
# Categorical: each distinct label stored once, rows hold integer codes
df['ticker'] = Categorical(df['ticker'])
df['industry'] = Categorical(df['industry'])
# one industry per ticker, split off into its own lookup table
industry = df['industry'].set_axis(df['ticker']).groupby('ticker').first()
# tidy core dataset: (date, ticker) index, sorted for fast slicing
df = df.drop('industry', axis='columns').set_index(['date', 'ticker']).sort_index()
print(
df.sample(3),
# industry,
sep='\n',
)