ts-python

Seminar I: A Data-Cleaning Deep-Dive with pandas (2022-06-10)

“The following tale of alien encounters is true. And by true, I mean false. It’s all lies. But they’re entertaining lies. And, in the end, isn’t that the real truth? The answer is: no.” — ‘The Springfield Files’ (S08E10)

Abstract

How do we handle inconsistent, missing, or poorly formatted data in our analyses? How do we ensure that our analyses are accurate if our data may have errors in it?

Do you…

Then join us for a session on efficient data cleaning in Python!

In this episode we’ll take a deep dive on a single dataset to better understand how we can effectively clean tabular data in pandas. We’ll discuss what clean data is and why untidy data can have disastrous effects while also exploring how we can best leverage pandas to tidy our data.

We’ll cover topics like wrangling strings, renaming, parsing datatypes, sorting/reindexing, reshaping, and binning to ensure your data is ready to be used by your models. Additionally, we’ll provide strategies you can use to verify the cleanliness of your data.

Keywords:

Notes

Question: How do I handle Excel data?

print("Let's take a look!")
# First pass: load the workbook as-is and see what pandas makes of it.
# NOTE(review): assumes 'data1.xlsx' has a clean single header row — confirm against the file.
from pandas import read_excel

df = read_excel('data1.xlsx')

print(df)
from pandas import read_excel

# 'data2.xlsx' presumably contains blank padding rows/columns; dropna() hides them.
# NOTE(review): dropna() with no arguments drops ANY row containing a missing
# value — it can silently discard real data rows too.
df = read_excel('data2.xlsx')

print(df.dropna())
from pandas import read_excel

# header=None: treat the first sheet row as data rather than column labels,
# so we can inspect the raw layout before deciding where the real header is.
df = read_excel('data2.xlsx', header=None)

print(df)
from contextlib import closing
from itertools import islice
from collections import namedtuple
from string import ascii_uppercase
from openpyxl import open as pyxl_open
from numpy import array
from pandas import DataFrame, to_numeric

# The rectangle of the sheet (0-based, inclusive) that holds the actual table.
ViewArea = namedtuple('ViewArea', 'min_row max_row min_col max_col')

# data_only=True reads cached formula results instead of the formula strings.
with closing(pyxl_open('data2.xlsx', data_only=True)) as f:
    sheet = f.worksheets[0]

    # NOTE(review): hard-coded to data2.xlsx's layout (header on the 4th sheet
    # row, columns A-D) — confirm against the file before reusing.
    view_area = ViewArea(
        min_row=3, max_row=6,
        min_col=0, max_col=3,
    )
    # Slice the cell grid down to the view area, keeping raw cell values.
    cells = array([
        [c.value for c in islice(row, view_area.min_col, view_area.max_col+1)]
        for row in islice(sheet.rows, view_area.min_row, view_area.max_row+1)
    ])

    # Sanity check: the first sliced row really is the expected header
    # (whitespace- and case-insensitively).
    assert cells[0, 0].strip().casefold() == 'name'
    assert cells[0, 1].strip().casefold() == 'group'
    assert cells[0, 2].strip().casefold() == 'value'

    # Remaining rows are the data proper.
    names = cells[1:, 0]
    groups = cells[1:, 1]
    values = cells[1:, 2]

    # The numpy array is object-dtyped, so force 'value' to a numeric dtype.
    df = DataFrame({
        'name':  names,
        'group': groups,
        'value': to_numeric(values),
    }).set_index('name')

    print(
        df,
        df.groupby('group').mean(),
        sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
    )

Question: How do I handle datetime data?

print("Let's take a look!")
# Without parse_dates, date-like columns are read as plain strings (object dtype).
from pandas import read_csv

df = read_csv('dt1.csv')

print(df)
from pandas import read_csv

# parse_dates=['date'] converts the column to a datetime dtype during the read itself.
df = read_csv('dt1.csv', parse_dates=['date'])

print(df)
from pandas import read_csv, to_datetime

# Alternative: read everything as strings first, then convert explicitly afterwards.
df = read_csv('dt1.csv')
df['date'] = to_datetime(df['date'])

print(df)
from pandas import to_datetime

# Mixed input formats: with no hints, pandas infers a format per element.
dts = to_datetime(['2020-01-02 10:00:00', '02/01/2020', '20200102T10:00:00'])
# NOTE(review): an explicit format that does not cover the whole string, or a
# unit= that expects epoch numbers, raises on these string inputs in modern
# pandas — these two lines appear intended to demonstrate the failure modes
# (each rebinding of dts discards the previous result). Confirm intent.
dts = to_datetime(['2020-01-02 10:00:00', '02/01/2020', '20200102T10:00:00'], format='%Y-%m-%d')
dts = to_datetime(['2020-01-02 10:00:00', '02/01/2020', '20200102T10:00:00'], unit='s')
print(dts)
from io import StringIO
from numpy.random import default_rng
from pandas import DataFrame, to_datetime, to_timedelta, Categorical, read_csv
from contextlib import contextmanager
from time import perf_counter
from string import ascii_lowercase

@contextmanager
def timed(msg):
    """Context manager: run the body, then print *msg* and its wall-clock duration.

    The report is printed even when the body raises (the timing sits in a
    ``finally`` clause).
    """
    t0 = perf_counter()
    try:
        yield
    finally:
        t1 = perf_counter()
        print(f'{msg:<48} elapsed \N{mathematical bold capital delta}t: {t1 - t0:.6f}')

rng = default_rng(0)

# Three random 4-letter entity codes: draw a (3, 4) grid of single letters and
# reinterpret each row of four chars as one '<U4' string.
entities = rng.choice([*ascii_lowercase], size=(3, 4)).view('<U4').ravel()

# One million rows of (timestamp, entity, value); timestamps advance by a
# random 0-59 seconds per row via a cumulative sum.
df = DataFrame({
    'date': to_datetime('2020-01-01')
          + to_timedelta(rng.integers(60, size=(sz := 1_000_000)).cumsum(), unit='s'),
    'entity': Categorical(rng.choice(entities, size=sz)),
    'value': rng.normal(size=sz).round(2),
}).set_index('date').sort_index()

print(df.sample(3))

# Round-trip through an in-memory CSV so the parsing timings below exclude disk I/O.
df.to_csv(csv := StringIO(), index=True)

# Baseline: no datetime parsing at all (parse_dates=None is read_csv's default).
csv.seek(0)
with timed('read_csv(…)'):
    df = read_csv(csv, parse_dates=None)
# print(df.dtypes)

# Variant 2: parse the 'date' column during the read.
csv.seek(0)
with timed('read_csv(…)'):
    df = read_csv(csv, parse_dates=['date'])
print(df.dtypes)

# Variant 3: read as strings, then convert afterwards with to_datetime.
csv.seek(0)
with timed('read_csv(…)'):
    df = read_csv(csv)
    df['date'] = to_datetime(df['date'])
print(df.dtypes)
# from pandas import to_datetime

def to_datetime(col):
    """Naive stand-in that shadows pandas' ``to_datetime``.

    Merely walks *col* element by element and returns None — illustrating the
    shape (and cost) of a pure-Python per-element conversion loop.
    """
    for _item in col:
        pass
from pandas import read_csv, Timestamp

# Parse dates on read, then map every index entry through Timestamp.
# NOTE(review): if dt2.csv holds mixed UTC offsets, parse_dates can leave an
# object-dtype index; map(Timestamp) makes each element a Timestamp but does
# not by itself yield a uniform DatetimeIndex — confirm against the file.
df = read_csv('dt2.csv', parse_dates=['date'], index_col=['date'])
df.index = df.index.map(Timestamp)

print(
    df,
    df.index,
    f'{type(df.index[0]) = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import read_csv, to_datetime

# Keep only the first 19 chars ('YYYY-MM-DD HH:MM:SS'), discarding any textual
# offset suffix, then stamp the naive datetimes as US/Eastern wall time.
# NOTE(review): tz_localize can raise on DST-ambiguous or nonexistent local
# times — confirm dt2.csv avoids those.
df = read_csv('dt2.csv')
df['date'] = to_datetime(df['date'].str[:19]).dt.tz_localize('US/Eastern')
df = df.set_index('date')

print(
    df,
    df.index,
    f'{type(df.index[0]) = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)

Question: How do I handle missing data?

print("Let's take a look!")
# Past 2**53, float64 can no longer represent every integer exactly:
# adding 1 rounds straight back to the same value.
x = 2.0 ** 53

report = [
    f'{x          = :,}',
    f'{x + 1      = :,}',
    f'{x == x + 1 = :}',
]
print('\n'.join(report))
# Float addition is not associative: the grouping decides whether the tiny
# term y survives the rounding.
x = 2.0 ** 53
y = 1
z = -(2.0 ** 53)

report = [
    f'{(x + y) + z = :,}',
    f'{x + (y + z) = :,}',
]
print('\n'.join(report))
float('nan')
from pandas import Series, NA
from numpy import nan

# A nullable-integer Series stores its payload and its missingness separately,
# instead of hijacking a float NaN sentinel.
# (Compare: Series([1, 2, 3, NA]) and Series([1, 2, 3, nan]) both fall back to
# non-integer dtypes.)
s3 = Series([1, 2, 3, NA], dtype='Int64')

rule = '\n{}\n'.format('\N{box drawings light horizontal}' * 40)
print(
    s3.array._data,   # raw int64 payload (value under the mask is arbitrary)
    s3.array._mask,   # boolean mask flagging the missing slot
    sep=rule,
)
from pandas import DataFrame

# 2**53 sits right at the edge of exactly-representable float64 integers, so
# the column is kept in the nullable 'Int64' dtype rather than floats.
x = 2 ** 53
df = DataFrame({
    'entity': ['a', 'b', 'a'],
    'group':  ['g1', 'g1', 'g2'],
    'value':  [x, x + 1, x + 2],
})
df['value'] = df['value'].astype('Int64')

# pivot_table(fill_value=0) fills the holes at aggregation time (contrast a
# groupby/unstack/fillna chain, which introduces missing cells first).
pivoted = df.pivot_table(
    index=['entity'],
    columns=['group'],
    values=['value'],
    aggfunc='sum',
    fill_value=0,
)

rule = '\n{}\n'.format('\N{box drawings light horizontal}' * 40)
print(df, pivoted, sep=rule)
from pandas import DataFrame, date_range
from numpy.random import default_rng

rng = default_rng(0)

sz = 90
# Build a daily series, then drop a random 20% of the rows to simulate gaps.
df = (
    DataFrame({
        'date': date_range('2020-01-01', periods=sz),
        'value': rng.normal(size=sz).round(2),
    })
    .set_index('date')
    .sample(frac=.80, random_state=rng)
    .sort_index()
)

rule = '\n{}\n'.format('\N{box drawings light horizontal}' * 40)
# resample('D').mean() restores the daily grid; days without data become NaN.
# (Other options explored live: reindex + fillna/bfill/ffill/interpolate.)
print(df.resample('D').mean(), sep=rule)

Commercial Break

What topics do you want to hear about?

What tools or technologies are you struggling with?

What new technologies do you want to learn more about?

How can we better support your use of Python, pandas, and open source scientific computing tools in your work?

Question: How do I extract data?

print("Let's take a look!")
from pandas import DataFrame

df = DataFrame({
    'entity':  ['a', 'b', 'c'],
    'details': ['x,y', 'x', 'z'],
})

# Split the comma-packed 'details' field into lists, then explode so each
# (entity, detail) pair gets its own row.
exploded = df.assign(
    details=lambda frame: frame['details'].str.split(','),
).explode('details')

rule = '\n{}\n'.format('\N{box drawings light horizontal}' * 40)
print(df, exploded, sep=rule)
from pandas import DataFrame

df = DataFrame({
    'entity':  ['a', 'b', 'c'],
    'details': ['x ,y', ' x', 'Z '],
})

# Normalisation pipeline, one stage at a time: trim the whole field, split on
# commas, explode to one row per token, trim each token, then lowercase.
stage1 = df['details'].str.strip()
stage2 = stage1.str.split(',')
stage3 = stage2.explode().str.strip()
stage4 = stage3.str.lower()

rule = '\n{}\n'.format('\N{box drawings light horizontal}' * 40)
print(df.sample(3), stage1, stage2, stage3, stage4, sep=rule)
from pandas import DataFrame
from re import compile as re_compile

df = DataFrame({
    'entity':  ['a', 'b', 'c'],
    'details': [
        'category =     xy, subcategory = y',
        'category = w',
        'category = x, subcategory = z',
    ],
})

# Whitespace-tolerant pattern:
#   category = <word> [, subcategory = <word>]
# BUGFIX: 'category' previously captured \w (a single character), so a value
# like 'xy' was truncated to 'x' and the optional subcategory clause could not
# match at all; \w+ captures the whole token, mirroring the subcategory group.
regex = re_compile(r'\s*'.join([
    'category', '=', r'(?P<category>\w+)',
    r'(?:{})?'.format(r'\s*'.join([
        ',', 'subcategory', '=', r'(?P<subcategory>\w+)',
    ]))
]))

print(
    df,
    # named groups become columns; dropna() keeps rows where the group matched
    df.set_index('entity')['details'].str.extract(regex)['category'].dropna(),
    df.set_index('entity')['details'].str.extract(regex)['subcategory'].dropna(),
    # join the extracted columns back onto the original frame
    df.join(
        df['details'].str.extract(regex)
    ),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)

Question: How do I normalise entities?

print("Let's take a look!")
from pandas import DataFrame
from numpy.random import default_rng

rng = default_rng(0)

# Four spellings that are really only two entities — a naive groupby treats
# each spelling as its own group.
entities = [
    'Company',
    'Company, Inc.',
    'Company, LLC',
    'Other Company',
]

df = DataFrame({
    'entity': entities,
    'value':  rng.normal(size=len(entities)).round(2),
})

rule = '\n{}\n'.format('\N{box drawings light horizontal}' * 40)
print(df, df.groupby('entity').sum(), sep=rule)
from pandas import DataFrame
from numpy.random import default_rng
from re import compile as re_compile

rng = default_rng(0)

entities = [
    'Company',
    'Company, Inc.',
    'Company, LLC',
    'Other Company',
]

df = DataFrame({
    'entity': entities,
    'value':  rng.normal(size=len(entities)).round(2),
})

# BUGFIX: the pattern is now a raw string — '(\w+)' in a plain literal is an
# invalid escape sequence (SyntaxWarning on modern Python, slated to become an
# error). Capturing the first word collapses the 'Company*' variants, but also
# maps 'Other Company' to 'Other' — hence the groupby is left commented out.
regex = re_compile(r'(\w+)')
print(
    df,
    df.groupby('entity').sum(),
    df.assign(
        entity=lambda frame: frame['entity'].str.extract(regex)
    ),#.groupby('entity').sum(),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import DataFrame
from numpy.random import default_rng
from re import compile as re_compile

rng = default_rng(0)

entities = [
    'Company',
    'Company, Inc.',
    'Company, LLC',
    'Other Company',
]

# Plain-dict mapping: any spelling absent from the dict maps to NaN,
# silently losing rows.
mapping = {
    'Company, Inc.': 'Company',
    'Company, LLC':  'Company',
}

df = DataFrame({
    'entity': entities,
    'value':  rng.normal(size=len(entities)).round(2),
})

rule = '\n{}\n'.format('\N{box drawings light horizontal}' * 40)
print(df, df['entity'].map(mapping), sep=rule)
from pandas import DataFrame
from numpy.random import default_rng
from re import compile as re_compile

rng = default_rng(0)

entities = [
    'Company',
    'Company, Inc.',
    'Company, LLC',
    'Other Company',
]

class passthru(dict):
    """dict that returns unknown keys unchanged instead of raising KeyError."""
    def __missing__(self, key):
        return key

# Only the aliases need entries; every other spelling falls through as-is.
mapping = passthru({
    'Company, Inc.': 'Company',
    'Company, LLC':  'Company',
})

df = DataFrame({
    'entity': entities,
    'value':  rng.normal(size=len(entities)).round(2),
})

rule = '\n{}\n'.format('\N{box drawings light horizontal}' * 40)
print(df, df['entity'].map(mapping), sep=rule)
from pandas import DataFrame, Series
from numpy.random import default_rng
from re import compile as re_compile

rng = default_rng(0)

entities = [
    'Company',
    'Company, Inc.',
    'Company, LLC',
    'Other Company',
]

# Identity mapping as a Series (index == values) with the aliases overridden;
# .loc lookups then behave like a pass-through dictionary.
mapping = Series(entities, index=entities)
mapping.loc['Company, Inc.'] = 'Company'
mapping.loc['Company, LLC']  = 'Company'

df = DataFrame({
    'entity': entities,
    'value':  rng.normal(size=len(entities)).round(2),
})

summary = (
    df.assign(clean_entity=mapping.loc[df['entity']].values)
      .groupby('clean_entity')
      .agg({
          'entity': set,   # keep the raw spellings that were merged together
          'value':  sum,
      })
)

rule = '\n{}\n'.format('\N{box drawings light horizontal}' * 40)
print(df['entity'], summary, sep=rule)

Question: How does it all come together?

print("Let's take a look!")
# The full cleaning pipeline, in the order discussed during the session:
# 1. initial data load
# 2. type conversions
#    - Categorical
#    - numeric types
#    - datetime types
# 3. data extraction
# 4. entity mapping
# 5. quality/sanity checks
# 6. dataset extraction & preparation
# 7. missing data
from pandas import read_csv, to_datetime, Categorical

# Steps 1-2: load, then coerce dtypes column by column.
df = read_csv('data.csv', index_col=[0])
df['date'] = to_datetime(df['date'])
df['price'] = df['price'].astype(float)
df['volumes'] = df['volumes'].astype(int)

# Step 3: tidy names and extract the structured field out of free text.
df = df.rename({'volumes': 'volume'}, axis='columns')
df['industry'] = df['industry'].str.extract(r'industry = (?P<industry>\w+)')
df['ticker'] = df['ticker'].str.lower()

# Low-cardinality string columns → Categorical.
df['ticker'] = Categorical(df['ticker'])
df['industry'] = Categorical(df['industry'])

# Per-ticker industry lookup: re-index industry by ticker and keep the first
# value per ticker (assumes industry is constant per ticker — TODO confirm).
industry = df['industry'].set_axis(df['ticker']).groupby('ticker').first()

# Step 6: industry now lives in its own table; index the facts by (date, ticker).
df = df.drop('industry', axis='columns').set_index(['date', 'ticker']).sort_index()

print(
    df.sample(3),
    # industry,
    sep='\n',
)